Commit

Trainer.debug()
dingguanglei committed Nov 20, 2018
1 parent 5177f40 commit ec8e822
Showing 19 changed files with 512 additions and 248 deletions.
16 changes: 8 additions & 8 deletions README.md
@@ -53,7 +53,7 @@ apply kaiming weight init!
===> Building optimizer
===> Training
using `tensorboard --logdir=log` to see learning curves and net structure.
training and valid data, configures info and checkpoint were save in `log` directory.
training and valid_epoch data, configures info and checkpoint were save in `log` directory.
0%| | 0/10 [00:00<?, ?epoch/s]
0step [00:00, ?step/s]
```
@@ -146,16 +146,16 @@ class Dataloaders_factory(metaclass=ABCMeta):
nsteps_train

```
To build your dataset class, including train, valid and test, you need to do the following.
* Define datasets. (If you don't define a test dataset, it will be replaced by the valid dataset.)
To build your dataset class, including train, valid_epoch and test, you need to do the following.
* Define datasets. (If you don't define a test dataset, it will be replaced by the valid_epoch dataset.)
* Define transforms. (A default is available.)

Example:
Define a dataset by using `FashionMNIST()`.

Use the default transforms.

Don't define a test dataset; the valid dataset is used instead of a test dataset.
Don't define a test dataset; the valid_epoch dataset is used instead of a test dataset.

```python
class FashionMNIST(Dataloaders_factory):
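    # Hedged, illustrative body (the original example is truncated here); it uses
    # the attributes that DataLoadersFactory defines in jdit/dataset.py below:
    # self.root, self.train_transform_list / self.valid_transform_list,
    # and registers self.dataset_train / self.dataset_valid.
    def build_datasets(self):
        self.dataset_train = datasets.FashionMNIST(self.root, train=True, download=True,
                                                    transform=transforms.Compose(self.train_transform_list))
        self.dataset_valid = datasets.FashionMNIST(self.root, train=False, download=True,
                                                    transform=transforms.Compose(self.valid_transform_list))
```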
@@ -214,9 +214,9 @@ class Model(object):
"""to assemble a model and weights from paths or passing parameters."""

def load_point(self, model_name, epoch, logdir="log"):
    """load model and weights from a checkpoint saved by `check_point()`."""
    """load model and weights from a checkpoint saved by `_check_point()`."""

def check_point(self, model_name, epoch, logdir="log"):
def _check_point(self, model_name, epoch, logdir="log"):

def countParams(self, proto_model):
    """count the total parameters of model."""
@@ -238,8 +238,8 @@ To wrap your pytorch model, you need to do the following.
* You must pass a model to this method, whether it is a path or a model.
* For `weight_or_path`: if it is not None, it can be a path or a weights OrderedDict,
  and it will be applied to the model.
* Do check_point.
* Using `check_point(model_name, epoch, logdir="log")` to save model checkpoint to `log/checkpoint/`.
* Do _check_point.
* Using `_check_point(model_name, epoch, logdir="log")` to save model checkpoint to `log/checkpoint/`.
* The filenames are `Weights_{model_name}_{epoch}.pth` and `Model_{model_name}_{epoch}.pth`.
* `loadPoint()` does exactly the opposite; a usage sketch follows.
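
A minimal usage sketch of the wrapping and checkpoint workflow described above. `SimpleNet` and the
import path are assumptions for illustration; `Model`, `_check_point` and `load_point` are the methods
documented in this section.

```python
import torch.nn as nn
from jdit.model import Model  # import path assumed from the repository layout

class SimpleNet(nn.Module):
    """A tiny stand-in module, only for demonstration."""
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc = nn.Linear(32, 10)

    def forward(self, x):
        return self.fc(x)

# wrap the raw module; weights get kaiming init and the device follows gpu_ids_abs
net = Model(SimpleNet(), gpu_ids_abs=(), init_method="kaiming")

# saves Weights_simplenet_1.pth and Model_simplenet_1.pth under `log/checkpoint/` (per the README above)
net._check_point("simplenet", epoch=1, logdir="log")

# the opposite direction: restore that checkpoint
net.load_point("simplenet", epoch=1, logdir="log")
```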
26 changes: 13 additions & 13 deletions docs/source/Build your own trainer.rst
@@ -45,7 +45,7 @@ Following these steps:

* Rewrite your own transforms to ``self.train_transform_list`` and ``self.valid_transform_list``. (Not necessary)
* Register your training dataset to ``self.dataset_train`` by using ``self.train_transform_list``
* Register your valid dataset to ``self.dataset_valid`` by using ``self.valid_transform_list``
* Register your valid_epoch dataset to ``self.dataset_valid`` by using ``self.valid_transform_list``

Example::
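
    # Hedged sketch (the original example is not shown here): rewrite the transform
    # lists, then register the datasets built from them. MNIST and the transform
    # values are illustrative choices, not library defaults.
    def build_transforms(self, resize=32):
        self.train_transform_list = [transforms.Resize(resize),
                                     transforms.ToTensor(),
                                     transforms.Normalize([0.5], [0.5])]
        self.valid_transform_list = self.train_transform_list

    def build_datasets(self):
        self.dataset_train = datasets.MNIST(self.root, train=True, download=True,
                                            transform=transforms.Compose(self.train_transform_list))
        self.dataset_valid = datasets.MNIST(self.root, train=False, download=True,
                                            transform=transforms.Compose(self.valid_transform_list))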

@@ -167,11 +167,11 @@ Something like this::

def train():
for epoch in range(nepochs):
self.update_config_info() # record info
self._record_configs() # record info
self.train_epoch(subbar_disable)
self.valid()
self.change_lr()
self.check_point()
self.valid_epoch()
self._change_lr()
self._check_point()
self.test()

Every method will be rewritten by the second-level templates. It only defines a rough framework.
@@ -215,7 +215,7 @@ You must use ``self.step`` to record the training step.
self.output = self.net(self.input)
# this is defined in SupTrainer.
# using `self.compute_loss` and `self.opt` to do a backward.
self.train_iteration(self.opt, self.compute_loss, tag="Train")
self._train_iteration(self.opt, self.compute_loss, tag="Train")
@abstractmethod
def compute_loss(self):
@@ -240,20 +240,20 @@ You must use ``self.step`` to record the training step.
@abstractmethod
def compute_valid(self):
"""Compute the valid variables for visualization.
"""Compute the valid_epoch variables for visualization.
Rewrite by the next templates.
Example::
var_dic = {}
# visualize the valid curve of CrossEntropyLoss
# visualize the valid_epoch curve of CrossEntropyLoss
var_dic["CEP"] = loss = CrossEntropyLoss()(self.output, self.labels.squeeze().long())
_, predict = torch.max(self.output.detach(), 1) # 0100=>1 0010=>2
total = predict.size(0) * 1.0
labels = self.labels.squeeze().long()
correct = predict.eq(labels).cpu().sum().float()
acc = correct / total
# visualize the valid curve of accuracy
# visualize the valid_epoch curve of accuracy
var_dic["ACC"] = acc
return var_dic
"""
@@ -262,15 +262,15 @@ Some other things; these are not necessary:

.. code-block:: python
def change_lr(self):
def _change_lr(self):
# If you need lr decay strategy, write this.
self.opt.do_lr_decay()
def check_point(self):
def _check_point(self):
# If you need checkpoint, write this.
self.net.check_point("classmodel", self.current_epoch, self.logdir)
self.net._check_point("classmodel", self.current_epoch, self.logdir)
def update_config_info(self):
def _record_configs(self):
# If you need to record the params changing such as lr changing.
self.loger.regist_config(self.opt, self.current_epoch)
# for self.performance.configure
2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -62,7 +62,7 @@ Then you will see something like the following.
===> Building optimizer
===> Training
using `tensorboard --logdir=log` to see learning curves and net structure.
training and valid data, configures info and checkpoint were save in `log` directory.
training and valid_epoch data, configures info and checkpoint were save in `log` directory.
0%| | 0/10 [00:00<?, ?epoch/s]
0step [00:00, ?step/s]
2 changes: 1 addition & 1 deletion docs/source/quick start.rst
@@ -34,7 +34,7 @@ Then you will see something like the following.
===> Building optimizer
===> Training
using `tensorboard --logdir=log` to see learning curves and net structure.
training and valid data, configures info and checkpoint were save in `log` directory.
training and valid_epoch data, configures info and checkpoint were save in `log` directory.
0%| | 0/10 [00:00<?, ?epoch/s]
0step [00:00, ?step/s]
2 changes: 1 addition & 1 deletion examples/generate_trainer.py
@@ -68,7 +68,7 @@ def compute_g_loss(self):
var_dic["LOSS_G"] = loss_g = -d_fake.mean()
return loss_g, var_dic

def valid(self):
def valid_epoch(self):
# register a fixed input
if self.fixed_input is None:
self.fixed_input = Variable(torch.randn((32, *self.latent_shape))).to(self.device)
2 changes: 1 addition & 1 deletion generate_trainer.py
@@ -59,7 +59,7 @@ def compute_g_loss(self):
# var_dic["WD"] = w_distance = (d_real.mean() - d_fake.mean()).detach()
# return var_dic

def valid(self):
def valid_epoch(self):
if self.fixed_input is None:
self.fixed_input = Variable()
if self.use_gpu:
28 changes: 15 additions & 13 deletions jdit/dataset.py
@@ -4,20 +4,19 @@
from typing import Union
from abc import ABCMeta, abstractmethod


class DataLoadersFactory(metaclass=ABCMeta):
"""This is a super class of dataloader.
It defines same basic attributes and methods.
* For training data: ``train_dataset``, ``loader_train``, ``nsteps_train`` .
Others such as ``valid`` and ``test`` have the same naming format.
Others such as ``valid_epoch`` and ``test`` have the same naming format.
* For transform, you can define your own transforms.
* If you don't have test set, it will be replaced by valid dataset.
* If you don't have test set, it will be replaced by valid_epoch dataset.
It will build datasets following these steps:
#. ``build_transforms()`` To build transforms for training dataset and valid.
#. ``build_transforms()`` To build transforms for training dataset and valid_epoch.
You can rewrite this method for your own transform. It will be used in ``build_datasets()``
#. ``build_datasets()`` You must rewrite this method to load your own dataset
by passing datasets to ``self.dataset_train`` and ``self.dataset_valid`` .
@@ -74,22 +73,22 @@ def __init__(self, root: str, batch_shape: Union[tuple, list], num_workers=-1, s
self.dataset_valid: datasets = None
self.dataset_test: datasets = None

self.loader_train:DataLoader = None
self.loader_valid:DataLoader = None
self.loader_test:DataLoader = None
self.loader_train: DataLoader = None
self.loader_valid: DataLoader = None
self.loader_test: DataLoader = None

self.nsteps_train:int = None
self.nsteps_valid:int = None
self.nsteps_test:int = None
self.nsteps_train: int = None
self.nsteps_valid: int = None
self.nsteps_test: int = None

self.sample_dataset_size = subdata_size

self.build_transforms()
self.build_datasets()
self.build_loaders()

def build_transforms(self, resize:int=32):
""" This will build transforms for training and valid.
def build_transforms(self, resize: int = 32):
""" This will build transforms for training and valid_epoch.
You can rewrite this method to build your own transforms.
Don't forget to register your transforms to ``self.train_transform_list`` and ``self.valid_transform_list``
@@ -114,7 +113,7 @@ def build_datasets(self):
""" You must to rewrite this method to load your own datasets.
* :attr:`self.dataset_train` . Assign a training ``dataset`` to this.
* :attr:`self.dataset_valid` . Assign a valid ``dataset`` to this.
* :attr:`self.dataset_valid` . Assign a valid_epoch ``dataset`` to this.
* :attr:`self.dataset_test` is optional. Assign a test ``dataset`` to this.
If not, it will be replaced by ``self.dataset_valid`` .
@@ -287,6 +286,9 @@ def build_datasets(self):
self.dataset_valid = datasets.CIFAR10(self.root, train=False, download=True,
transform=transforms.Compose(self.valid_transform_list))

def build_transforms(self, resize: int = 32):
super(Lsun, self).build_transforms(resize)


def get_mnist_dataloaders(root=r'..\data', batch_size=128):
"""MNIST dataloader with (32, 32) sized images."""
58 changes: 35 additions & 23 deletions jdit/model.py
@@ -5,6 +5,8 @@
from typing import Union
from collections import OrderedDict
from types import FunctionType


class _cached_property(object):
"""
Decorator that converts a method with a single self argument into a
@@ -30,7 +32,7 @@ class Model(object):
r"""A warapper of pytorch ``module`` .
In the simplest case, we use a raw pytorch ``module`` to assemble a ``Model`` of this class.
It can be more convenient to use some helper methods, such as ``check_point``, ``load_model`` and so on.
It can be more convenient to use some helper methods, such as ``_check_point``, ``load_model`` and so on.
* :attr:`proto_model` is the core model in this class.
It is not necessary to pass a ``module`` when you init a ``Model``.
@@ -89,16 +91,19 @@ class Model(object):
"""

def __init__(self, proto_model: Module =None, gpu_ids_abs: Union[list,tuple]=(), init_method: [str, FunctionType] = "kaiming",
def __init__(self, proto_model: Module = None, gpu_ids_abs: Union[list, tuple] = (),
init_method: Union[str, FunctionType, None] = "kaiming",
show_structure=False, verbose=True):
if not gpu_ids_abs:
gpu_ids_abs = []
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in gpu_ids_abs])
self.gpu_ids = [i for i in range(len(gpu_ids_abs))]
self.model = None
self.model: Union[DataParallel, Module] = None
# self.model_name :str= None
self.weights_init = None
self.init_fc = None
self.num_params = 0
self.init_name: str = None
self.num_params: int = 0
self.verbose = verbose
if proto_model is not None:
self.define(proto_model, self.gpu_ids, init_method, show_structure)
@@ -109,7 +114,7 @@ def __call__(self, *args, **kwargs):
def __getattr__(self, item):
return getattr(self.model, item)

def define(self, proto_model: Module, gpu_ids: Union[list, tuple], init_method: Union[str, FunctionType],
def define(self, proto_model: Module, gpu_ids: Union[list, tuple], init_method: Union[str, FunctionType, None] ,
show_structure: bool):
"""Define and wrap a pytorch module, according to CPU, GPU and multi-GPUs.
@@ -125,8 +130,8 @@ def define(self, proto_model: Module, gpu_ids: Union[list, tuple], init_method:
"""
self.num_params = self.print_network(proto_model, show_structure)
self.model = self._set_device(proto_model, gpu_ids)
init_name = self._apply_weight_init(init_method, proto_model)
self._print("apply %s weight init!" % init_name)
self.init_name = self._apply_weight_init(init_method, proto_model)
self._print("apply %s weight init!" % self.init_name)

def print_network(self, proto_model: Module, show_structure=False):
"""Print total number of parameters and structure of network
@@ -290,13 +295,18 @@ def count_params(self, proto_model: Module):
num_params += param.numel()
return num_params

#
# @_cached_property
# def paramNum(self):
# if isinstance(self.model, DataParallel):
# return self.count_params(self.model.module)
# else:
# return self.count_params(self.model)
def reset_device(self, gpu_ids_abs: list = None):
assert self.model is not None, "You must have a `model` before you reset device!"

if gpu_ids_abs is None:
gpu_ids_abs = []
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in gpu_ids_abs])
self.gpu_ids = [i for i in range(len(gpu_ids_abs))]
if isinstance(self.model, DataParallel):
proto_model = self.model.module
else:
proto_model = self.model
self.define(proto_model, self.gpu_ids, None, False)

def _apply_weight_init(self, init_method: Union[str, FunctionType], proto_model: Module):
init_name = "No"
@@ -358,10 +368,12 @@ def _fix_weights(self, weights: OrderedDict):
def _set_device(self, proto_model: Module, gpu_ids: list):
gpu_available = torch.cuda.is_available()
model_name = proto_model.__class__.__name__
if (len(gpu_ids) == 1) & gpu_available:
if (len(gpu_ids) == 1):
assert gpu_available, "No gpu available! torch.cuda.is_available() is False"
proto_model = proto_model.cuda(gpu_ids[0])
self._print("%s model use GPU(%d)!" % (model_name, gpu_ids[0]))
elif (len(gpu_ids) > 1) & gpu_available:
elif (len(gpu_ids) > 1):
assert gpu_available, "No gpu available! torch.cuda.is_available() is False"
proto_model = DataParallel(proto_model.cuda(), gpu_ids)
self._print("%s dataParallel use GPUs%s!" % (model_name, gpu_ids))
else:
@@ -375,14 +387,14 @@ def _print(self, str: str):
@property
def configure(self):
config_dic = dict()
config_dic["model_name"] = str(self.model.__class__.__name__)
if hasattr(self.init_fc, __name__):
config_dic["init_method"] = str(self.init_fc.__name__)
if isinstance(self.model, DataParallel):
config_dic["model_name"] = str(self.model.module.__class__.__name__)
elif isinstance(self.model, Module):
config_dic["model_name"] = str(self.model.__class__.__name__)
else:
config_dic["init_method"] = str(self.init_fc)
config_dic["model_name"] = 'None'
config_dic["init_method"] = str(self.init_name)
config_dic["gpus"] = len(self.gpu_ids)
config_dic["total_params"] = self.num_params
config_dic["structure"] = []
for item in self.model._modules.items():
config_dic["structure"].append(str(item))
config_dic["structure"] = str(self.model)
return config_dic
