Skip to content

Commit

Permalink
[fix][OSS] adding an assert for empty shards + corresponding unit test (
Browse files Browse the repository at this point in the history
#406)

* adding an assert + corresponding unit test
* updated changelog
* adjusting the adascale tests
  • Loading branch information
blefaudeux authored Feb 22, 2021
1 parent a606e84 commit 279b802
Show file tree
Hide file tree
Showing 7 changed files with 37 additions and 10 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## NEXT - TBD
### Fixed
- Catch corner case when the model is too small with respect to the world size, and shards are empty ([#406](https://github.com/facebookresearch/fairscale/pull/406))

## [0.1.7] - 2021-02-19
### Fixed
Expand Down
7 changes: 7 additions & 0 deletions fairscale/optim/oss.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,13 @@ def partition_parameters(self) -> List[List[dict]]:
param_group_rank["params"] = params
self._partition_parameters[rank].append(param_group_rank)

assert min(sum(len(pg["params"]) for pg in partition) for partition in self._partition_parameters) > 0, (
"One or more empty shards detected, the world size is too big or the model too small.\n"
+ "Please reduce your world size if this is the model you would like to train\n"
+ f"Current world size: {self.world_size}\n"
+ "Current number of parameters: {}".format(sum(len(pg["params"]) for pg in self.param_groups))
)

return self._partition_parameters

@property
Expand Down
8 changes: 4 additions & 4 deletions fairscale/utils/golden_testing_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@

adascale_test_data = [
# "input" value is a list of input tensors for micro-batch/rank 0 and micro-batch/rank 1.
{"input": [[1.0, 0], [0, 1.0]], "expected_gain": 2.0},
{"input": [[1.0, 0], [0, 1.0]], "expected_gain": 4.0 / 3},
{"input": [[1.0, 1.0], [1.0, 1.0]], "expected_gain": 1.0000001249999846},
{"input": [[-1.0, 1.0], [1.0, -1.0]], "expected_gain": 2.0},
{"input": [[1.0, 4.0], [5.0, 0.5]], "expected_gain": 1.5022222222222221},
{"input": [[-0.2, 3.0], [5.0, 0.5]], "expected_gain": 1.9433267229211089},
{"input": [[1.0, 4.0], [5.0, 0.5]], "expected_gain": 1.4688796680497926},
{"input": [[-0.2, 3.0], [5.0, 0.5]], "expected_gain": 1.8472893901708},
# "inputs" to trigger multiple iteration tests, which make sure the
# smoothing factor calculation is also covered.
{"inputs": [[[-0.2, 3.3], [5.2, 0.7]], [[1.0, 4.0], [3.1, 0.1]]], "expected_gain": 1.744159431359284},
{"inputs": [[[-0.2, 3.3], [5.2, 0.7]], [[1.0, 4.0], [3.1, 0.1]]], "expected_gain": 1.6720968158031417},
]
2 changes: 1 addition & 1 deletion tests/optim/test_ddp_adascale.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def _dist_init(rank, world_size, tempfile_name, backend):
def _test_basic_func(rank, world_size, tempfile_name, test_case):
_dist_init(rank, world_size, tempfile_name, backend="nccl") # Covers nccl

model = Linear(2, 2, bias=False)
model = Linear(2, 2)
model.to("cuda")
model = DDP(model, device_ids=[rank])
optim = AdaScale(SGD(model.parameters(), lr=0.1))
Expand Down
18 changes: 16 additions & 2 deletions tests/optim/test_oss.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,8 +234,7 @@ def test_add_param_group():
if torch.cuda.is_available() and torch.cuda.device_count() < world_size:
world_size = min(world_size, torch.cuda.device_count())

temp_file_name = tempfile.mkstemp()[1]
mp.spawn(run_test_add_param_group, args=(world_size, temp_file_name), nprocs=world_size, join=True)
mp.spawn(run_test_add_param_group, args=(world_size, tempfile.mkstemp()[1]), nprocs=world_size, join=True)


def run_test_zero_grad(rank, world_size, tempfile_name):
Expand Down Expand Up @@ -263,6 +262,21 @@ def test_zero_grad():
mp.spawn(run_test_zero_grad, args=(world_size, temp_file_name), nprocs=world_size, join=True)


def run_test_catch_empty_shardd(rank, world_size, tempfile_name):
    # Per-rank worker: a single-parameter-tensor model cannot produce a
    # non-empty shard on every rank when world_size > number of param tensors,
    # so constructing OSS is expected to trip its empty-shard assertion.
    dist_init(rank, world_size, tempfile_name, backend="gloo")

    tiny_model = torch.nn.Linear(1, 1)
    with pytest.raises(AssertionError):
        _ = optim.OSS(tiny_model.parameters(), lr=0.1)

    dist.destroy_process_group()


def test_empty_shard():
    """Spawn more ranks than the toy model has parameter tensors and check
    that OSS refuses to build empty shards (gloo backend, CPU-only)."""
    world_size = 4
    temp_file = tempfile.mkstemp()[1]

    mp.spawn(run_test_catch_empty_shardd, args=(world_size, temp_file), nprocs=world_size, join=True)


def run_test_step(rank, world_size, tempfile_name):
dist_init(rank, world_size, tempfile_name, backend="gloo")
x = torch.tensor([float(rank + 1)], device=rank)
Expand Down
8 changes: 6 additions & 2 deletions tests/optim/test_oss_adascale.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ def _test_basic_func(rank, world_size, tempfile_name, test_case, oss, model=None
_dist_init(rank, world_size, tempfile_name, backend="nccl")

if model is None:
model = Linear(2, 2, bias=False)
model = Linear(2, 2)
model.bias.data.fill_(0.0)

model.to("cuda")
model = DDP(model, device_ids=[rank])

Expand Down Expand Up @@ -65,7 +67,9 @@ def _test_basic_func(rank, world_size, tempfile_name, test_case, oss, model=None
optim.zero_grad()

if "expected_gain" in test_case:
assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
assert np.allclose(optim.gain(), test_case["expected_gain"]), "{} vs {}".format(
optim.gain(), test_case["expected_gain"]
)

if "expected_mean_weight" in test_case:
mean_weight = mean([model.module[i].weight.data.mean().item() for i in range(4)])
Expand Down
2 changes: 1 addition & 1 deletion tests/optim/test_single_node_adascale.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def test_loss_accum_cpu():
@pytest.mark.parametrize("test_case", adascale_test_data)
def test_grad_accum(test_case, cpu):
"""Test the basic functionality on CPU/GPU with gradient accumulation without DDP"""
model = Linear(2, 2, bias=False)
model = Linear(2, 2, bias=True)
if not cpu:
if torch.cuda.device_count() < 1:
pytest.skip("1 GPU is required")
Expand Down

0 comments on commit 279b802

Please sign in to comment.