refactor: train logging #1572

Merged · 5 commits · Jun 22, 2022
3 changes: 1 addition & 2 deletions deeppavlov/configs/classifiers/paraphraser_rubert.json
@@ -46,8 +46,7 @@
"validation_patience": 7,
"val_every_n_batches": 50,
"log_every_n_batches": 50,
"validate_best": true,
"test_best": true,
"evaluation_targets": ["valid", "test"],
"class_name": "torch_trainer"
},
"metadata": {
27 changes: 2 additions & 25 deletions deeppavlov/core/commands/train.py
@@ -70,7 +70,6 @@ def train_evaluate_model_from_config(config: Union[str, Path, dict],
iterator: Union[DataLearningIterator, DataFittingIterator] = None, *,
to_train: bool = True,
evaluation_targets: Optional[Iterable[str]] = None,
to_validate: Optional[bool] = None,
download: bool = False,
start_epoch_num: Optional[int] = None,
recursive: bool = False) -> Dict[str, Dict[str, float]]:
@@ -98,22 +97,11 @@ def train_evaluate_model_from_config(config: Union[str, Path, dict],

if 'train' not in config:
log.warning('Train config is missing. Populating with default values')
train_config = config.get('train')
train_config = config.get('train', {})

if start_epoch_num is not None:
train_config['start_epoch_num'] = start_epoch_num

if 'evaluation_targets' not in train_config and ('validate_best' in train_config
or 'test_best' in train_config):
log.warning('"validate_best" and "test_best" parameters are deprecated.'
' Please, use "evaluation_targets" list instead')

train_config['evaluation_targets'] = []
if train_config.pop('validate_best', True):
train_config['evaluation_targets'].append('valid')
if train_config.pop('test_best', True):
train_config['evaluation_targets'].append('test')

trainer_class = get_model(train_config.pop('class_name', 'nn_trainer'))
trainer = trainer_class(config['chainer'], **train_config)

@@ -123,18 +111,7 @@ def train_evaluate_model_from_config(config: Union[str, Path, dict],
res = {}

if iterator is not None:
if to_validate is not None:
if evaluation_targets is None:
log.warning('"to_validate" parameter is deprecated and will be removed in future versions.'
' Please, use "evaluation_targets" list instead')
evaluation_targets = ['test']
if to_validate:
evaluation_targets.append('valid')
else:
log.warning('Both "evaluation_targets" and "to_validate" parameters are specified.'
' "to_validate" is deprecated and will be ignored')

res = trainer.evaluate(iterator, evaluation_targets, print_reports=True)
res = trainer.evaluate(iterator, evaluation_targets)
trainer.get_chainer().destroy()

res = {k: v['metrics'] for k, v in res.items()}
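For configs still using the deprecated flags, the shim removed above is the exact mapping: `validate_best` controlled the `valid` entry and `test_best` the `test` entry of `evaluation_targets`, with both defaulting to true. A minimal standalone sketch of that migration (the helper name is ours, for illustration; DeepPavlov ships no such function):

```python
# Sketch of the old-to-new trainer-config migration, mirroring the
# back-compat shim removed from train.py above.
def migrate_train_config(train_config: dict) -> dict:
    if 'evaluation_targets' not in train_config:
        targets = []
        if train_config.pop('validate_best', True):  # old default: True
            targets.append('valid')
        if train_config.pop('test_best', True):      # old default: True
            targets.append('test')
        train_config['evaluation_targets'] = targets
    return train_config


print(migrate_train_config({'validate_best': True, 'test_best': False}))
# -> {'evaluation_targets': ['valid']}
```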
53 changes: 53 additions & 0 deletions deeppavlov/core/common/log_events.py
@@ -0,0 +1,53 @@
# Copyright 2019 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from typing import Optional
from deeppavlov.core.commands.utils import expand_path

log = getLogger(__name__)


class TBWriter:
def __init__(self, tensorboard_log_dir: str):
# TODO: After adding wandb logger, create common parent class for both loggers
from torch.utils.tensorboard import SummaryWriter
tensorboard_log_dir = expand_path(tensorboard_log_dir)
self.tb_train_writer = SummaryWriter(str(tensorboard_log_dir / 'train_log'))
self.tb_valid_writer = SummaryWriter(str(tensorboard_log_dir / 'valid_log'))

# TODO: find how to write Summary
def write_train(self, tag, scalar_value, global_step):
self.tb_train_writer.add_scalar(tag, scalar_value, global_step)

def write_valid(self, tag, scalar_value, global_step):
self.tb_valid_writer.add_scalar(tag, scalar_value, global_step)

def flush(self):
self.tb_train_writer.flush()
self.tb_valid_writer.flush()


def get_tb_writer(tensorboard_log_dir: Optional[str]) -> Optional[TBWriter]:
try:
if tensorboard_log_dir is not None:
tb_writer = TBWriter(tensorboard_log_dir)
else:
tb_writer = None
except ImportError:
log.error('Failed to import SummaryWriter from torch.utils.tensorboard. Failed to initialize Tensorboard '
'logger. Install appropriate Pytorch version to use this logger or remove tensorboard_log_dir '
'parameter from the train parameters list in the configuration file.')
tb_writer = None
return tb_writer
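A minimal usage sketch of the new wrapper (the log directory and tags below are illustrative): `get_tb_writer` returns `None` both when no directory is configured and when `torch.utils.tensorboard` cannot be imported, so callers need only a single `None` check.

```python
# Usage sketch for TBWriter via get_tb_writer; requires a PyTorch build
# with tensorboard support, otherwise an error is logged and None returned.
from deeppavlov.core.common.log_events import get_tb_writer

tb_writer = get_tb_writer('~/.deeppavlov/my_model_logs')  # None disables logging
if tb_writer is not None:
    tb_writer.write_train(tag='every_n_batches/loss', scalar_value=0.42, global_step=100)
    tb_writer.write_valid(tag='every_n_batches/accuracy', scalar_value=0.87, global_step=100)
    tb_writer.flush()
```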
56 changes: 6 additions & 50 deletions deeppavlov/core/trainers/fit_trainer.py
@@ -17,11 +17,9 @@
import time
from itertools import islice
from logging import getLogger
from pathlib import Path
from typing import Tuple, Dict, Union, Optional, Iterable, Any, Collection

from deeppavlov.core.commands.infer import build_model
from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.chainer import Chainer
from deeppavlov.core.common.params import from_params
from deeppavlov.core.common.registry import register
@@ -31,6 +29,7 @@
from deeppavlov.core.trainers.utils import Metric, parse_metrics, prettify_metrics, NumpyArrayEncoder

log = getLogger(__name__)
report_log = getLogger('train_report')


@register('fit_trainer')
@@ -50,8 +49,6 @@ class FitTrainer:
evaluation_targets: data types on which to evaluate trained pipeline (default is ``('valid', 'test')``)
show_examples: a flag used to print inputs, expected outputs and predicted outputs for the last batch
in evaluation logs (default is ``False``)
tensorboard_log_dir: path to a directory where tensorboard logs can be stored, ignored if None
(default is ``None``)
max_test_batches: maximum batches count for pipeline testing and evaluation, ignored if negative
(default is ``-1``)
**kwargs: additional parameters whose names will be logged but otherwise ignored
@@ -61,7 +58,6 @@ def __init__(self, chainer_config: dict, *, batch_size: int = -1,
metrics: Iterable[Union[str, dict]] = ('accuracy',),
evaluation_targets: Iterable[str] = ('valid', 'test'),
show_examples: bool = False,
tensorboard_log_dir: Optional[Union[str, Path]] = None,
max_test_batches: int = -1,
**kwargs) -> None:
if kwargs:
@@ -72,23 +68,7 @@ def __init__(self, chainer_config: dict, *, batch_size: int = -1,
self.metrics = parse_metrics(metrics, self._chainer.in_y, self._chainer.out_params)
self.evaluation_targets = tuple(evaluation_targets)
self.show_examples = show_examples

self.max_test_batches = None if max_test_batches < 0 else max_test_batches

self.tensorboard_log_dir: Optional[Path] = tensorboard_log_dir
if tensorboard_log_dir is not None:
try:
# noinspection PyPackageRequirements
# noinspection PyUnresolvedReferences
import tensorflow
except ImportError:
log.warning('TensorFlow could not be imported, so tensorboard log directory'
f'`{self.tensorboard_log_dir}` will be ignored')
self.tensorboard_log_dir = None
else:
self.tensorboard_log_dir = expand_path(tensorboard_log_dir)
self._tf = tensorflow

self._built = False
self._saved = False
self._loaded = False
@@ -110,37 +90,15 @@ def fit_chainer(self, iterator: Union[DataFittingIterator, DataLearningIterator]
targets = [targets]

if self.batch_size > 0 and callable(getattr(component, 'partial_fit', None)):
writer = None

for i, (x, y) in enumerate(iterator.gen_batches(self.batch_size, shuffle=False)):
preprocessed = self._chainer.compute(x, y, targets=targets)
# noinspection PyUnresolvedReferences
result = component.partial_fit(*preprocessed)

if result is not None and self.tensorboard_log_dir is not None:
if writer is None:
writer = self._tf.summary.FileWriter(str(self.tensorboard_log_dir /
f'partial_fit_{component_index}_log'))
for name, score in result.items():
summary = self._tf.Summary()
summary.value.add(tag='partial_fit/' + name, simple_value=score)
writer.add_summary(summary, i)
writer.flush()
component.partial_fit(*preprocessed)
else:
preprocessed = self._chainer.compute(*iterator.get_instances(), targets=targets)
if len(targets) == 1:
preprocessed = [preprocessed]
result: Optional[Dict[str, Iterable[float]]] = component.fit(*preprocessed)

if result is not None and self.tensorboard_log_dir is not None:
writer = self._tf.summary.FileWriter(str(self.tensorboard_log_dir /
f'fit_log_{component_index}'))
for name, scores in result.items():
for i, score in enumerate(scores):
summary = self._tf.Summary()
summary.value.add(tag='fit/' + name, simple_value=score)
writer.add_summary(summary, i)
writer.flush()
component.fit(*preprocessed)

component.save()

@@ -240,15 +198,14 @@ def test(self, data: Iterable[Tuple[Collection[Any], Collection[Any]]],

return report

def evaluate(self, iterator: DataLearningIterator, evaluation_targets: Optional[Iterable[str]] = None, *,
print_reports: bool = True) -> Dict[str, dict]:
def evaluate(self, iterator: DataLearningIterator,
evaluation_targets: Optional[Iterable[str]] = None) -> Dict[str, dict]:
"""
Run :meth:`test` on multiple data types using provided data iterator

Args:
iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation
evaluation_targets: iterable of data types to evaluate on
print_reports: a flag used to print evaluation reports as json lines

Returns:
a dictionary with data types as keys and evaluation reports as values
@@ -263,7 +220,6 @@ def evaluate(self, iterator: DataLearningIterator, evaluation_targets: Optional[
data_gen = iterator.gen_batches(self.batch_size, data_type=data_type, shuffle=False)
report = self.test(data_gen)
res[data_type] = report
if print_reports:
print(json.dumps({data_type: report}, ensure_ascii=False, cls=NumpyArrayEncoder))
report_log.info(json.dumps({data_type: report}, ensure_ascii=False, cls=NumpyArrayEncoder))

return res
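With `print_reports` removed, evaluation reports are always emitted as JSON lines through the dedicated `train_report` logger instead of bare `print` calls, so they can be rerouted or silenced with standard `logging` configuration rather than stdout capture. A minimal sketch of that emission path (report contents invented for illustration; the real code also passes `cls=NumpyArrayEncoder` to serialize numpy values):

```python
# Sketch of the report emission path that replaces print(): one JSON line
# per data split, written through the 'train_report' logger.
import json
import logging

logging.basicConfig(level=logging.INFO, format='%(message)s')
report_log = logging.getLogger('train_report')

report = {'valid': {'eval_examples_count': 100, 'metrics': {'accuracy': 0.87}}}
report_log.info(json.dumps(report, ensure_ascii=False))

# Silencing reports no longer requires redirecting stdout:
logging.getLogger('train_report').setLevel(logging.WARNING)
```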
36 changes: 15 additions & 21 deletions deeppavlov/core/trainers/nn_trainer.py
@@ -25,8 +25,9 @@
from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
from deeppavlov.core.trainers.fit_trainer import FitTrainer
from deeppavlov.core.trainers.utils import parse_metrics, NumpyArrayEncoder

from deeppavlov.core.common.log_events import get_tb_writer
log = getLogger(__name__)
report_log = getLogger('train_report')


@register('nn_trainer')
@@ -105,8 +106,7 @@ def __init__(self, chainer_config: dict, *,
log_every_n_batches: int = -1, log_every_n_epochs: int = -1, log_on_k_batches: int = 1,
**kwargs) -> None:
super().__init__(chainer_config, batch_size=batch_size, metrics=metrics, evaluation_targets=evaluation_targets,
show_examples=show_examples, tensorboard_log_dir=tensorboard_log_dir,
max_test_batches=max_test_batches, **kwargs)
show_examples=show_examples, max_test_batches=max_test_batches, **kwargs)
if train_metrics is None:
self.train_metrics = self.metrics
else:
@@ -145,10 +145,7 @@ def _improved(op):
self.last_result = {}
self.losses = []
self.start_time: Optional[float] = None

if self.tensorboard_log_dir is not None:
self.tb_train_writer = self._tf.summary.FileWriter(str(self.tensorboard_log_dir / 'train_log'))
self.tb_valid_writer = self._tf.summary.FileWriter(str(self.tensorboard_log_dir / 'valid_log'))
self.tb_writer = get_tb_writer(tensorboard_log_dir)

def save(self) -> None:
if self._loaded:
@@ -174,14 +171,13 @@ def _validate(self, iterator: DataLearningIterator,

metrics = list(report['metrics'].items())

if tensorboard_tag is not None and self.tensorboard_log_dir is not None:
summary = self._tf.Summary()
for name, score in metrics:
summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score)
if tensorboard_tag is not None and self.tb_writer is not None:
if tensorboard_index is None:
tensorboard_index = self.train_batches_seen
self.tb_valid_writer.add_summary(summary, tensorboard_index)
self.tb_valid_writer.flush()
for name, score in metrics:
self.tb_writer.write_valid(tag=f'{tensorboard_tag}/{name}', scalar_value=score,
global_step=tensorboard_index)
self.tb_writer.flush()

m_name, score = metrics[0]

@@ -217,7 +213,7 @@ def _validate(self, iterator: DataLearningIterator,

self._send_event(event_name='after_validation', data=report)
report = {'valid': report}
print(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder))
report_log.info(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder))
self.validation_number += 1

def _log(self, iterator: DataLearningIterator,
Expand Down Expand Up @@ -246,18 +242,16 @@ def _log(self, iterator: DataLearningIterator,
self.losses.clear()
metrics.append(('loss', report['loss']))

if metrics and self.tensorboard_log_dir is not None:
summary = self._tf.Summary()

if metrics and self.tb_writer is not None:
for name, score in metrics:
summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score)
self.tb_train_writer.add_summary(summary, tensorboard_index)
self.tb_train_writer.flush()
self.tb_writer.write_train(tag=f'{tensorboard_tag}/{name}', scalar_value=score,
global_step=tensorboard_index)
self.tb_writer.flush()

self._send_event(event_name='after_train_log', data=report)

report = {'train': report}
print(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder))
report_log.info(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder))

def _send_event(self, event_name: str, data: Optional[dict] = None) -> None:
report = {
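The trainer now hands scalars to `TBWriter` instead of assembling `tf.Summary` protos and calling `FileWriter.add_summary` by hand. Under torch, one `add_scalar` call per metric does the same job; a self-contained sketch of what the wrapper does for one logged metric (paths and tags are examples):

```python
# What TBWriter does under the hood: torch's SummaryWriter.add_scalar
# replaces the old tf.Summary + FileWriter.add_summary pair.
from torch.utils.tensorboard import SummaryWriter

train_writer = SummaryWriter('logs/train_log')
valid_writer = SummaryWriter('logs/valid_log')

for step, loss in enumerate([1.2, 0.9, 0.7]):  # toy training curve
    train_writer.add_scalar('every_n_batches/loss', loss, global_step=step)
valid_writer.add_scalar('every_n_batches/accuracy', 0.87, global_step=2)

train_writer.flush()
valid_writer.flush()
```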
16 changes: 16 additions & 0 deletions deeppavlov/utils/settings/log_config.json
@@ -23,6 +23,13 @@
],
"propagate": true
},
"train_report": {
"level": "INFO",
"handlers": [
"train_handler"
],
"propagate": true
},
"filelock": {
"level": "WARNING",
"handlers": [
@@ -39,6 +46,9 @@
"uvicorn_fmt": {
"format": "%(asctime)s %(message)s",
"datefmt": "%Y-%m-%d %H:%M:%S"
},
"message": {
"format": "%(message)s"
}
},
"handlers": {
@@ -66,6 +76,12 @@
"formatter": "uvicorn_fmt",
"stream": "ext://sys.stdout",
"filters": ["probeFilter"]
},
"train_handler": {
"class": "logging.StreamHandler",
"level": "INFO",
"formatter": "message",
"stream": "ext://sys.stdout"
}
},
"filters": {
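The added `train_report` logger writes bare messages to stdout through `train_handler`; the `message` formatter adds no timestamp or level prefix, so each emitted report stays a valid JSON line. A self-contained sketch of the same wiring via `logging.config.dictConfig`, trimmed to the parts this PR adds:

```python
# dictConfig equivalent of the additions to log_config.json.
import logging
import logging.config

logging.config.dictConfig({
    'version': 1,
    'formatters': {'message': {'format': '%(message)s'}},
    'handlers': {
        'train_handler': {
            'class': 'logging.StreamHandler',
            'level': 'INFO',
            'formatter': 'message',
            'stream': 'ext://sys.stdout',
        },
    },
    'loggers': {
        'train_report': {'level': 'INFO', 'handlers': ['train_handler'], 'propagate': True},
    },
})

logging.getLogger('train_report').info('{"train": {"loss": 0.7}}')  # bare JSON line
```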