Skip to content

Commit

Permalink
[fix][OSS] adding an assert for empty shards + corresponding unit test (
Browse files Browse the repository at this point in the history
#406)

* adding an assert + corresponding unit test
* updated changelog
* adjusting the adascale tests
  • Loading branch information
blefaudeux authored Feb 22, 2021
1 parent a606e84 commit 279b802
Show file tree
Hide file tree
Showing 7 changed files with 37 additions and 10 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## NEXT - TBD
### Fixed
- Catch corner case when the model is too small with respect to the world size, and shards are empty ([#406](https://github.com/facebookresearch/fairscale/pull/406))

## [0.1.7] - 2021-02-19
### Fixed
Expand Down
7 changes: 7 additions & 0 deletions fairscale/optim/oss.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,13 @@ def partition_parameters(self) -> List[List[dict]]:
param_group_rank["params"] = params
self._partition_parameters[rank].append(param_group_rank)

assert min(sum(len(pg["params"]) for pg in partition) for partition in self._partition_parameters) > 0, (
"One or more empty shards detected, the world size is too big or the model too small.\n"
+ "Please reduce your world size if this is the model you would like to train\n"
+ f"Current world size: {self.world_size}\n"
+ "Current number of parameters: {}".format(sum(len(pg["params"]) for pg in self.param_groups))
)

return self._partition_parameters

@property
Expand Down
8 changes: 4 additions & 4 deletions fairscale/utils/golden_testing_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@

adascale_test_data = [
# "input" value is a list of input tensors for micro-batch/rank 0 and micro-batch/rank 1.
{"input": [[1.0, 0], [0, 1.0]], "expected_gain": 2.0},
{"input": [[1.0, 0], [0, 1.0]], "expected_gain": 4.0 / 3},
{"input": [[1.0, 1.0], [1.0, 1.0]], "expected_gain": 1.0000001249999846},
{"input": [[-1.0, 1.0], [1.0, -1.0]], "expected_gain": 2.0},
{"input": [[1.0, 4.0], [5.0, 0.5]], "expected_gain": 1.5022222222222221},
{"input": [[-0.2, 3.0], [5.0, 0.5]], "expected_gain": 1.9433267229211089},
{"input": [[1.0, 4.0], [5.0, 0.5]], "expected_gain": 1.4688796680497926},
{"input": [[-0.2, 3.0], [5.0, 0.5]], "expected_gain": 1.8472893901708},
# "inputs" to trigger multiple iteration tests, which make sure the
# smoothing factor calculation is also covered.
{"inputs": [[[-0.2, 3.3], [5.2, 0.7]], [[1.0, 4.0], [3.1, 0.1]]], "expected_gain": 1.744159431359284},
{"inputs": [[[-0.2, 3.3], [5.2, 0.7]], [[1.0, 4.0], [3.1, 0.1]]], "expected_gain": 1.6720968158031417},
]
2 changes: 1 addition & 1 deletion tests/optim/test_ddp_adascale.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def _dist_init(rank, world_size, tempfile_name, backend):
def _test_basic_func(rank, world_size, tempfile_name, test_case):
_dist_init(rank, world_size, tempfile_name, backend="nccl") # Covers nccl

model = Linear(2, 2, bias=False)
model = Linear(2, 2)
model.to("cuda")
model = DDP(model, device_ids=[rank])
optim = AdaScale(SGD(model.parameters(), lr=0.1))
Expand Down
18 changes: 16 additions & 2 deletions tests/optim/test_oss.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,8 +234,7 @@ def test_add_param_group():
if torch.cuda.is_available() and torch.cuda.device_count() < world_size:
world_size = min(world_size, torch.cuda.device_count())

temp_file_name = tempfile.mkstemp()[1]
mp.spawn(run_test_add_param_group, args=(world_size, temp_file_name), nprocs=world_size, join=True)
mp.spawn(run_test_add_param_group, args=(world_size, tempfile.mkstemp()[1]), nprocs=world_size, join=True)


def run_test_zero_grad(rank, world_size, tempfile_name):
Expand Down Expand Up @@ -263,6 +262,21 @@ def test_zero_grad():
mp.spawn(run_test_zero_grad, args=(world_size, temp_file_name), nprocs=world_size, join=True)


def run_test_catch_empty_shardd(rank, world_size, tempfile_name):
    # Per-rank worker: a single-parameter-tensor model cannot produce a
    # non-empty shard on every rank when world_size > number of param tensors,
    # so constructing OSS is expected to trip its empty-shard assertion.
    dist_init(rank, world_size, tempfile_name, backend="gloo")

    tiny_model = torch.nn.Linear(1, 1)
    with pytest.raises(AssertionError):
        _ = optim.OSS(tiny_model.parameters(), lr=0.1)

    dist.destroy_process_group()


def test_empty_shard():
    """Spawn more ranks than the toy model has parameter tensors and check
    that OSS refuses to build empty shards (gloo backend, CPU-only)."""
    world_size = 4
    temp_file = tempfile.mkstemp()[1]

    mp.spawn(run_test_catch_empty_shardd, args=(world_size, temp_file), nprocs=world_size, join=True)


def run_test_step(rank, world_size, tempfile_name):
dist_init(rank, world_size, tempfile_name, backend="gloo")
x = torch.tensor([float(rank + 1)], device=rank)
Expand Down
8 changes: 6 additions & 2 deletions tests/optim/test_oss_adascale.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ def _test_basic_func(rank, world_size, tempfile_name, test_case, oss, model=None
_dist_init(rank, world_size, tempfile_name, backend="nccl")

if model is None:
model = Linear(2, 2, bias=False)
model = Linear(2, 2)
model.bias.data.fill_(0.0)

model.to("cuda")
model = DDP(model, device_ids=[rank])

Expand Down Expand Up @@ -65,7 +67,9 @@ def _test_basic_func(rank, world_size, tempfile_name, test_case, oss, model=None
optim.zero_grad()

if "expected_gain" in test_case:
assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
assert np.allclose(optim.gain(), test_case["expected_gain"]), "{} vs {}".format(
optim.gain(), test_case["expected_gain"]
)

if "expected_mean_weight" in test_case:
mean_weight = mean([model.module[i].weight.data.mean().item() for i in range(4)])
Expand Down
2 changes: 1 addition & 1 deletion tests/optim/test_single_node_adascale.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def test_loss_accum_cpu():
@pytest.mark.parametrize("test_case", adascale_test_data)
def test_grad_accum(test_case, cpu):
"""Test the basic functionality on CPU/GPU with gradient accumulation without DDP"""
model = Linear(2, 2, bias=False)
model = Linear(2, 2, bias=True)
if not cpu:
if torch.cuda.device_count() < 1:
pytest.skip("1 GPU is required")
Expand Down

0 comments on commit 279b802

Please sign in to comment.