Merge pull request NVIDIA#1262 from lukeyeager/fix-batch-accumulation

Fix batch accumulation
ethantang95 · Nov 16, 2016 · commit f7cf695
2 parents: 75ec4a0 + b8e905a
Show file tree

Hide file tree

Showing 4 changed files with 66 additions and 24 deletions.
diff --git a/digits/dataset/images/classification/test_imageset_creator.py b/digits/dataset/images/classification/test_imageset_creator.py
@@ -15,22 +15,18 @@
 import PIL.Image
 
 
-IMAGE_SIZE = 10
-IMAGE_COUNT = 10  # per category
-
-
-def create_classification_imageset(folder, image_size=None, image_count=None, add_unbalanced_category=False):
+def create_classification_imageset(
+        folder,
+        image_size=10,
+        image_count=10,
+        add_unbalanced_category=False,
+):
     """
     Creates a folder of folders of images for classification
 
     If requested to add an unbalanced category then a category is added with
     half the number of samples of other categories
     """
-    if image_size is None:
-        image_size = IMAGE_SIZE
-    if image_count is None:
-        image_count = IMAGE_COUNT
-
     # Stores the relative path of each image of the dataset
     paths = defaultdict(list)
 

diff --git a/digits/dataset/images/classification/test_views.py b/digits/dataset/images/classification/test_views.py
@@ -15,7 +15,7 @@
 from bs4 import BeautifulSoup
 import PIL.Image
 
-from .test_imageset_creator import create_classification_imageset, IMAGE_COUNT as DUMMY_IMAGE_COUNT
+from .test_imageset_creator import create_classification_imageset
 from digits import test_utils
 import digits.test_views
 
@@ -64,6 +64,7 @@ class BaseViewsTestWithImageset(BaseViewsTest):
     Provides an imageset and some functions
     """
     # Inherited classes may want to override these default attributes
+    IMAGE_COUNT = 10  # per class
     IMAGE_HEIGHT = 10
     IMAGE_WIDTH = 10
     IMAGE_CHANNELS = 3
@@ -78,8 +79,11 @@ def setUpClass(cls):
         super(BaseViewsTestWithImageset, cls).setUpClass()
         cls.imageset_folder = tempfile.mkdtemp()
         # create imageset
-        cls.imageset_paths = create_classification_imageset(cls.imageset_folder,
-                                                            add_unbalanced_category=cls.UNBALANCED_CATEGORY)
+        cls.imageset_paths = create_classification_imageset(
+            cls.imageset_folder,
+            image_count=cls.IMAGE_COUNT,
+            add_unbalanced_category=cls.UNBALANCED_CATEGORY,
+        )
         cls.created_datasets = []
 
     @classmethod
@@ -363,7 +367,7 @@ def check_image_count(self, type):
                 assert parse_info['val_count'] == 0
                 image_count = parse_info['test_count']
         assert self.categoryCount() == parse_info['label_count']
-        assert image_count == DUMMY_IMAGE_COUNT * parse_info['label_count'], 'image count mismatch'
+        assert image_count == self.IMAGE_COUNT * parse_info['label_count'], 'image count mismatch'
         assert self.delete_dataset(job_id) == 200, 'delete failed'
         assert not self.dataset_exists(job_id), 'dataset exists after delete'
 
@@ -375,9 +379,9 @@ def test_max_per_class(self):
             yield self.check_max_per_class, type
 
     def check_max_per_class(self, type):
-        # create dataset, asking for at most DUMMY_IMAGE_COUNT/2 images per class
-        assert DUMMY_IMAGE_COUNT % 2 == 0
-        max_per_class = DUMMY_IMAGE_COUNT / 2
+        # create dataset, asking for at most IMAGE_COUNT/2 images per class
+        assert self.IMAGE_COUNT % 2 == 0
+        max_per_class = self.IMAGE_COUNT / 2
         data = {'folder_pct_val': 0}
         if type == 'train':
             data['folder_train_max_per_class'] = max_per_class
@@ -418,7 +422,7 @@ def test_min_per_class(self):
     def check_min_per_class(self, type):
         # create dataset, asking for one more image per class
         # than available in the "unbalanced" category
-        min_per_class = DUMMY_IMAGE_COUNT / 2 + 1
+        min_per_class = self.IMAGE_COUNT / 2 + 1
         data = {'folder_pct_val': 0}
         if type == 'train':
             data['folder_train_min_per_class'] = min_per_class

diff --git a/digits/model/images/classification/test_views.py b/digits/model/images/classification/test_views.py
@@ -3,6 +3,7 @@
 
 import itertools
 import json
+import math
 import os
 import shutil
 import tempfile
@@ -16,13 +17,16 @@
     from StringIO import StringIO
 
 from bs4 import BeautifulSoup
+from google.protobuf import text_format
 
 from digits.config import config_value
 import digits.dataset.images.classification.test_views
+from digits.frameworks import CaffeFramework
 import digits.test_views
 from digits import test_utils
 import digits.webapp
 
+import caffe_pb2
 
 # May be too short on a slow system
 TIMEOUT_DATASET = 45
@@ -101,6 +105,10 @@ def model_exists(cls, job_id):
     def model_status(cls, job_id):
         return cls.job_status(job_id, 'models')
 
+    @classmethod
+    def model_info(cls, job_id):
+        return cls.job_info(job_id, 'models')
+
     @classmethod
     def abort_model(cls, job_id):
         return cls.abort_job(job_id, job_type='models')
@@ -1254,3 +1262,33 @@ def test_sweep(self):
             assert self.model_wait_completion(job_id) == 'Done', 'create failed'
             assert self.delete_model(job_id) == 200, 'delete failed'
             assert not self.model_exists(job_id), 'model exists after delete'
+
+
+@unittest.skipIf(
+    not CaffeFramework().can_accumulate_gradients(),
+    'This version of Caffe cannot accumulate gradients')
+class TestBatchAccumulationCaffe(BaseViewsTestWithDataset, test_utils.CaffeMixin):
+    TRAIN_EPOCHS = 1
+    IMAGE_COUNT = 10  # per class
+
+    def test_batch_accumulation_calculations(self):
+        batch_size = 10
+        batch_accumulation = 2
+
+        job_id = self.create_model(
+            batch_size=batch_size,
+            batch_accumulation=batch_accumulation,
+        )
+        assert self.model_wait_completion(job_id) == 'Done', 'create failed'
+        info = self.model_info(job_id)
+        solver = caffe_pb2.SolverParameter()
+        with open(os.path.join(info['directory'], info['solver file']), 'r') as infile:
+            text_format.Merge(infile.read(), solver)
+        assert solver.iter_size == batch_accumulation, \
+            'iter_size is %d instead of %d' % (solver.iter_size, batch_accumulation)
+        max_iter = int(math.ceil(
+            float(self.TRAIN_EPOCHS * self.IMAGE_COUNT * 3) /
+            (batch_size * batch_accumulation)
+        ))
+        assert solver.max_iter == max_iter,\
+            'max_iter is %d instead of %d' % (solver.max_iter, max_iter)
diff --git a/digits/model/tasks/caffe_train.py b/digits/model/tasks/caffe_train.py
@@ -525,8 +525,10 @@ def save_files_classification(self):
             solver.iter_size = self.batch_accumulation
 
         # Epochs -> Iterations
-        train_iter = int(math.ceil(float(self.dataset.get_entry_count(
-            constants.TRAIN_DB)) / train_data_layer.data_param.batch_size))
+        train_iter = int(math.ceil(
+            float(self.dataset.get_entry_count(constants.TRAIN_DB)) /
+            (train_data_layer.data_param.batch_size * solver.iter_size)
+        ))
         solver.max_iter = train_iter * self.train_epochs
         snapshot_interval = self.snapshot_interval * train_iter
         if 0 < snapshot_interval <= 1:
@@ -598,7 +600,7 @@ def save_files_classification(self):
         # Display 8x per epoch, or once per 5000 images, whichever is more frequent
         solver.display = max(1, min(
             int(math.floor(float(solver.max_iter) / (self.train_epochs * 8))),
-            int(math.ceil(5000.0 / train_data_layer.data_param.batch_size))
+            int(math.ceil(5000.0 / (train_data_layer.data_param.batch_size * solver.iter_size)))
         ))
 
         if self.random_seed is not None:
@@ -753,8 +755,10 @@ def save_files_generic(self):
             solver.iter_size = self.batch_accumulation
 
         # Epochs -> Iterations
-        train_iter = int(math.ceil(float(self.dataset.get_entry_count(constants.TRAIN_DB)) /
-                                   train_image_data_layer.data_param.batch_size))
+        train_iter = int(math.ceil(
+            float(self.dataset.get_entry_count(constants.TRAIN_DB)) /
+            (train_image_data_layer.data_param.batch_size * solver.iter_size)
+        ))
         solver.max_iter = train_iter * self.train_epochs
         snapshot_interval = self.snapshot_interval * train_iter
         if 0 < snapshot_interval <= 1:
@@ -821,7 +825,7 @@ def save_files_generic(self):
         # Display 8x per epoch, or once per 5000 images, whichever is more frequent
         solver.display = max(1, min(
             int(math.floor(float(solver.max_iter) / (self.train_epochs * 8))),
-            int(math.ceil(5000.0 / train_image_data_layer.data_param.batch_size))
+            int(math.ceil(5000.0 / (train_image_data_layer.data_param.batch_size * solver.iter_size)))
         ))
 
         if self.random_seed is not None: