Commit

Trainer.debug()
dingguanglei committed Nov 20, 2018
1 parent 5177f40 commit ec8e822
Showing 19 changed files with 512 additions and 248 deletions.
16 changes: 8 additions & 8 deletions README.md
@@ -53,7 +53,7 @@ apply kaiming weight init!
===> Building optimizer
===> Training
using `tensorboard --logdir=log` to see learning curves and net structure.
training and valid data, configures info and checkpoint were save in `log` directory.
training and valid_epoch data, configures info and checkpoint were save in `log` directory.
0%| | 0/10 [00:00<?, ?epoch/s]
0step [00:00, ?step/s]
```
@@ -146,16 +146,16 @@ class Dataloaders_factory(metaclass=ABCMeta):
nsteps_train

```
To build your dataset class, including train, valid and test, you need to do the following.
* Define datasets. (If you don't define a test dataset, it will be replaced by the valid dataset.)
To build your dataset class, including train, valid_epoch and test, you need to do the following.
* Define datasets. (If you don't define a test dataset, it will be replaced by the valid_epoch dataset.)
* Define transforms. (A default is available.)

Example:
Define a dataset by using `FashionMNIST()`.

Use the default transforms.

Don't define a test dataset; the valid dataset is used instead of a test dataset.
Don't define a test dataset; the valid_epoch dataset is used instead of a test dataset.

```python
class FashionMNIST(Dataloaders_factory):
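    # Hedged, illustrative body (the original example is truncated here); it uses
    # the attributes that DataLoadersFactory defines in jdit/dataset.py below:
    # self.root, self.train_transform_list / self.valid_transform_list,
    # and registers self.dataset_train / self.dataset_valid.
    def build_datasets(self):
        self.dataset_train = datasets.FashionMNIST(self.root, train=True, download=True,
                                                    transform=transforms.Compose(self.train_transform_list))
        self.dataset_valid = datasets.FashionMNIST(self.root, train=False, download=True,
                                                    transform=transforms.Compose(self.valid_transform_list))
```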
@@ -214,9 +214,9 @@ class Model(object):
"""to assemble a model and weights from paths or passing parameters."""

def load_point(self, model_name, epoch, logdir="log"):
    """load model and weights from a checkpoint saved by `check_point()`."""
    """load model and weights from a checkpoint saved by `_check_point()`."""

def check_point(self, model_name, epoch, logdir="log"):
def _check_point(self, model_name, epoch, logdir="log"):

def countParams(self, proto_model):
    """count the total parameters of model."""
@@ -238,8 +238,8 @@ To wrap your pytorch model, you need to do the following.
* You must pass a model to this method, whether it is a path or a model.
* For `weight_or_path`: if it is not None, it can be a path or a weights OrderedDict,
  and it will be applied to the model.
* Do check_point.
* Using `check_point(model_name, epoch, logdir="log")` to save model checkpoint to `log/checkpoint/`.
* Do _check_point.
* Using `_check_point(model_name, epoch, logdir="log")` to save model checkpoint to `log/checkpoint/`.
* The filenames are `Weights_{model_name}_{epoch}.pth` and `Model_{model_name}_{epoch}.pth`.
* `loadPoint()` does exactly the opposite; a usage sketch follows.
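
A minimal usage sketch of the wrapping and checkpoint workflow described above. `SimpleNet` and the
import path are assumptions for illustration; `Model`, `_check_point` and `load_point` are the methods
documented in this section.

```python
import torch.nn as nn
from jdit.model import Model  # import path assumed from the repository layout

class SimpleNet(nn.Module):
    """A tiny stand-in module, only for demonstration."""
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc = nn.Linear(32, 10)

    def forward(self, x):
        return self.fc(x)

# wrap the raw module; weights get kaiming init and the device follows gpu_ids_abs
net = Model(SimpleNet(), gpu_ids_abs=(), init_method="kaiming")

# saves Weights_simplenet_1.pth and Model_simplenet_1.pth under `log/checkpoint/` (per the README above)
net._check_point("simplenet", epoch=1, logdir="log")

# the opposite direction: restore that checkpoint
net.load_point("simplenet", epoch=1, logdir="log")
```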
26 changes: 13 additions & 13 deletions docs/source/Build your own trainer.rst
@@ -45,7 +45,7 @@ Following these steps:

* Rewrite your own transforms to ``self.train_transform_list`` and ``self.valid_transform_list``. (Not necessary)
* Register your training dataset to ``self.dataset_train`` by using ``self.train_transform_list``
* Register your valid dataset to ``self.dataset_valid`` by using ``self.valid_transform_list``
* Register your valid_epoch dataset to ``self.dataset_valid`` by using ``self.valid_transform_list``

Example::
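
    # Hedged sketch (the original example is not shown here): rewrite the transform
    # lists, then register the datasets built from them. MNIST and the transform
    # values are illustrative choices, not library defaults.
    def build_transforms(self, resize=32):
        self.train_transform_list = [transforms.Resize(resize),
                                     transforms.ToTensor(),
                                     transforms.Normalize([0.5], [0.5])]
        self.valid_transform_list = self.train_transform_list

    def build_datasets(self):
        self.dataset_train = datasets.MNIST(self.root, train=True, download=True,
                                            transform=transforms.Compose(self.train_transform_list))
        self.dataset_valid = datasets.MNIST(self.root, train=False, download=True,
                                            transform=transforms.Compose(self.valid_transform_list))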

@@ -167,11 +167,11 @@ Something like this::

def train():
for epoch in range(nepochs):
self.update_config_info() # record info
self._record_configs() # record info
self.train_epoch(subbar_disable)
self.valid()
self.change_lr()
self.check_point()
self.valid_epoch()
self._change_lr()
self._check_point()
self.test()

Every method will be rewritten by the second-level templates. It only defines a rough framework.
@@ -215,7 +215,7 @@ You must use ``self.step`` to record the training step.
self.output = self.net(self.input)
# this is defined in SupTrainer.
# using `self.compute_loss` and `self.opt` to do a backward.
self.train_iteration(self.opt, self.compute_loss, tag="Train")
self._train_iteration(self.opt, self.compute_loss, tag="Train")
@abstractmethod
def compute_loss(self):
@@ -240,20 +240,20 @@ You must use ``self.step`` to record the training step.
@abstractmethod
def compute_valid(self):
"""Compute the valid variables for visualization.
"""Compute the valid_epoch variables for visualization.
Rewrite by the next templates.
Example::
var_dic = {}
# visualize the valid curve of CrossEntropyLoss
# visualize the valid_epoch curve of CrossEntropyLoss
var_dic["CEP"] = loss = CrossEntropyLoss()(self.output, self.labels.squeeze().long())
_, predict = torch.max(self.output.detach(), 1) # 0100=>1 0010=>2
total = predict.size(0) * 1.0
labels = self.labels.squeeze().long()
correct = predict.eq(labels).cpu().sum().float()
acc = correct / total
# visualize the valid curve of accuracy
# visualize the valid_epoch curve of accuracy
var_dic["ACC"] = acc
return var_dic
"""
@@ -262,15 +262,15 @@ Some other things; these are not necessary:

.. code-block:: python
def change_lr(self):
def _change_lr(self):
# If you need lr decay strategy, write this.
self.opt.do_lr_decay()
def check_point(self):
def _check_point(self):
# If you need checkpoint, write this.
self.net.check_point("classmodel", self.current_epoch, self.logdir)
self.net._check_point("classmodel", self.current_epoch, self.logdir)
def update_config_info(self):
def _record_configs(self):
# If you need to record the params changing such as lr changing.
self.loger.regist_config(self.opt, self.current_epoch)
# for self.performance.configure
2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -62,7 +62,7 @@ Then you will see something like the following.
===> Building optimizer
===> Training
using `tensorboard --logdir=log` to see learning curves and net structure.
training and valid data, configures info and checkpoint were save in `log` directory.
training and valid_epoch data, configures info and checkpoint were save in `log` directory.
0%| | 0/10 [00:00<?, ?epoch/s]
0step [00:00, ?step/s]
2 changes: 1 addition & 1 deletion docs/source/quick start.rst
@@ -34,7 +34,7 @@ Then you will see something like the following.
===> Building optimizer
===> Training
using `tensorboard --logdir=log` to see learning curves and net structure.
training and valid data, configures info and checkpoint were save in `log` directory.
training and valid_epoch data, configures info and checkpoint were save in `log` directory.
0%| | 0/10 [00:00<?, ?epoch/s]
0step [00:00, ?step/s]
2 changes: 1 addition & 1 deletion examples/generate_trainer.py
@@ -68,7 +68,7 @@ def compute_g_loss(self):
var_dic["LOSS_G"] = loss_g = -d_fake.mean()
return loss_g, var_dic

def valid(self):
def valid_epoch(self):
# register a fixed input
if self.fixed_input is None:
self.fixed_input = Variable(torch.randn((32, *self.latent_shape))).to(self.device)
2 changes: 1 addition & 1 deletion generate_trainer.py
@@ -59,7 +59,7 @@ def compute_g_loss(self):
# var_dic["WD"] = w_distance = (d_real.mean() - d_fake.mean()).detach()
# return var_dic

def valid(self):
def valid_epoch(self):
if self.fixed_input is None:
self.fixed_input = Variable()
if self.use_gpu:
28 changes: 15 additions & 13 deletions jdit/dataset.py
@@ -4,20 +4,19 @@
from typing import Union
from abc import ABCMeta, abstractmethod


class DataLoadersFactory(metaclass=ABCMeta):
"""This is a super class of dataloader.
It defines same basic attributes and methods.
* For training data: ``train_dataset``, ``loader_train``, ``nsteps_train`` .
Others such as ``valid`` and ``test`` have the same naming format.
Others such as ``valid_epoch`` and ``test`` have the same naming format.
* For transform, you can define your own transforms.
* If you don't have test set, it will be replaced by valid dataset.
* If you don't have test set, it will be replaced by valid_epoch dataset.
It will build datasets following these steps:
#. ``build_transforms()`` To build transforms for training dataset and valid.
#. ``build_transforms()`` To build transforms for training dataset and valid_epoch.
You can rewrite this method for your own transform. It will be used in ``build_datasets()``
#. ``build_datasets()`` You must rewrite this method to load your own dataset
by passing datasets to ``self.dataset_train`` and ``self.dataset_valid`` .
@@ -74,22 +73,22 @@ def __init__(self, root: str, batch_shape: Union[tuple, list], num_workers=-1, s
self.dataset_valid: datasets = None
self.dataset_test: datasets = None

self.loader_train:DataLoader = None
self.loader_valid:DataLoader = None
self.loader_test:DataLoader = None
self.loader_train: DataLoader = None
self.loader_valid: DataLoader = None
self.loader_test: DataLoader = None

self.nsteps_train:int = None
self.nsteps_valid:int = None
self.nsteps_test:int = None
self.nsteps_train: int = None
self.nsteps_valid: int = None
self.nsteps_test: int = None

self.sample_dataset_size = subdata_size

self.build_transforms()
self.build_datasets()
self.build_loaders()

def build_transforms(self, resize:int=32):
""" This will build transforms for training and valid.
def build_transforms(self, resize: int = 32):
""" This will build transforms for training and valid_epoch.
You can rewrite this method to build your own transforms.
Don't forget to register your transforms to ``self.train_transform_list`` and ``self.valid_transform_list``
@@ -114,7 +113,7 @@ def build_datasets(self):
""" You must to rewrite this method to load your own datasets.
* :attr:`self.dataset_train` . Assign a training ``dataset`` to this.
* :attr:`self.dataset_valid` . Assign a valid ``dataset`` to this.
* :attr:`self.dataset_valid` . Assign a valid_epoch ``dataset`` to this.
* :attr:`self.dataset_test` is optional. Assign a test ``dataset`` to this.
If not, it will be replaced by ``self.dataset_valid`` .
@@ -287,6 +286,9 @@ def build_datasets(self):
self.dataset_valid = datasets.CIFAR10(self.root, train=False, download=True,
transform=transforms.Compose(self.valid_transform_list))

def build_transforms(self, resize: int = 32):
super(Lsun, self).build_transforms(resize)


def get_mnist_dataloaders(root=r'..\data', batch_size=128):
"""MNIST dataloader with (32, 32) sized images."""
58 changes: 35 additions & 23 deletions jdit/model.py
@@ -5,6 +5,8 @@
from typing import Union
from collections import OrderedDict
from types import FunctionType


class _cached_property(object):
"""
Decorator that converts a method with a single self argument into a
@@ -30,7 +32,7 @@ class Model(object):
r"""A warapper of pytorch ``module`` .
In the simplest case, we use a raw pytorch ``module`` to assemble a ``Model`` of this class.
It can be more convenient to use some helper methods, such as ``check_point``, ``load_model`` and so on.
It can be more convenient to use some helper methods, such as ``_check_point``, ``load_model`` and so on.
* :attr:`proto_model` is the core model in this class.
It is not necessary to pass a ``module`` when you init a ``Model``.
@@ -89,16 +91,19 @@ class Model(object):
"""

def __init__(self, proto_model: Module =None, gpu_ids_abs: Union[list,tuple]=(), init_method: [str, FunctionType] = "kaiming",
def __init__(self, proto_model: Module = None, gpu_ids_abs: Union[list, tuple] = (),
init_method: Union[str, FunctionType, None] = "kaiming",
show_structure=False, verbose=True):
if not gpu_ids_abs:
gpu_ids_abs = []
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in gpu_ids_abs])
self.gpu_ids = [i for i in range(len(gpu_ids_abs))]
self.model = None
self.model: Union[DataParallel, Module] = None
# self.model_name :str= None
self.weights_init = None
self.init_fc = None
self.num_params = 0
self.init_name: str = None
self.num_params: int = 0
self.verbose = verbose
if proto_model is not None:
self.define(proto_model, self.gpu_ids, init_method, show_structure)
@@ -109,7 +114,7 @@ def __call__(self, *args, **kwargs):
def __getattr__(self, item):
return getattr(self.model, item)

def define(self, proto_model: Module, gpu_ids: Union[list, tuple], init_method: Union[str, FunctionType],
def define(self, proto_model: Module, gpu_ids: Union[list, tuple], init_method: Union[str, FunctionType, None] ,
show_structure: bool):
"""Define and wrap a pytorch module, according to CPU, GPU and multi-GPUs.
@@ -125,8 +130,8 @@ def define(self, proto_model: Module, gpu_ids: Union[list, tuple], init_method:
"""
self.num_params = self.print_network(proto_model, show_structure)
self.model = self._set_device(proto_model, gpu_ids)
init_name = self._apply_weight_init(init_method, proto_model)
self._print("apply %s weight init!" % init_name)
self.init_name = self._apply_weight_init(init_method, proto_model)
self._print("apply %s weight init!" % self.init_name)

def print_network(self, proto_model: Module, show_structure=False):
"""Print total number of parameters and structure of network
@@ -290,13 +295,18 @@ def count_params(self, proto_model: Module):
num_params += param.numel()
return num_params

#
# @_cached_property
# def paramNum(self):
# if isinstance(self.model, DataParallel):
# return self.count_params(self.model.module)
# else:
# return self.count_params(self.model)
def reset_device(self, gpu_ids_abs: list = None):
assert self.model is not None, "You must have a `model` before you reset device!"

if gpu_ids_abs is None:
gpu_ids_abs = []
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in gpu_ids_abs])
self.gpu_ids = [i for i in range(len(gpu_ids_abs))]
if isinstance(self.model, DataParallel):
proto_model = self.model.module
else:
proto_model = self.model
self.define(proto_model, self.gpu_ids, None, False)

def _apply_weight_init(self, init_method: Union[str, FunctionType], proto_model: Module):
init_name = "No"
@@ -358,10 +368,12 @@ def _fix_weights(self, weights: OrderedDict):
def _set_device(self, proto_model: Module, gpu_ids: list):
gpu_available = torch.cuda.is_available()
model_name = proto_model.__class__.__name__
if (len(gpu_ids) == 1) & gpu_available:
if (len(gpu_ids) == 1):
assert gpu_available, "No gpu available! torch.cuda.is_available() is False"
proto_model = proto_model.cuda(gpu_ids[0])
self._print("%s model use GPU(%d)!" % (model_name, gpu_ids[0]))
elif (len(gpu_ids) > 1) & gpu_available:
elif (len(gpu_ids) > 1):
assert gpu_available, "No gpu available! torch.cuda.is_available() is False"
proto_model = DataParallel(proto_model.cuda(), gpu_ids)
self._print("%s dataParallel use GPUs%s!" % (model_name, gpu_ids))
else:
@@ -375,14 +387,14 @@ def _print(self, str: str):
@property
def configure(self):
config_dic = dict()
config_dic["model_name"] = str(self.model.__class__.__name__)
if hasattr(self.init_fc, __name__):
config_dic["init_method"] = str(self.init_fc.__name__)
if isinstance(self.model, DataParallel):
config_dic["model_name"] = str(self.model.module.__class__.__name__)
elif isinstance(self.model, Module):
config_dic["model_name"] = str(self.model.__class__.__name__)
else:
config_dic["init_method"] = str(self.init_fc)
config_dic["model_name"] = 'None'
config_dic["init_method"] = str(self.init_name)
config_dic["gpus"] = len(self.gpu_ids)
config_dic["total_params"] = self.num_params
config_dic["structure"] = []
for item in self.model._modules.items():
config_dic["structure"].append(str(item))
config_dic["structure"] = str(self.model)
return config_dic
