Remove seed-isort-config and related dependencies. (#969)
This is no longer needed now that isort is at version 5.10: isort 5 classifies
third-party imports on its own, so the seed-isort-config hook and the
known_third_party list it maintained can be dropped.

Also pin black to 22.3.0 to fix an issue with the click dependency.

Update the files that now fail with the new version of black: a = 2 ** 4 ->
a = 2**4
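
As a quick illustration of the formatting change behind most of this diff, the sketch below (not part of the commit; the file and variable names are hypothetical) shows how black 22.x writes the power operator compared with the 21.x output noted in the trailing comments: spaces around ** are dropped when both operands are simple (names, numeric literals, or attribute access) and otherwise left in place.

# power_operator_demo.py -- hypothetical example, already formatted with black 22.3.0
allocated_bytes = 12_884_901_888

peak_gib = allocated_bytes / 2**30  # black 21.x kept the spaces: allocated_bytes / 2 ** 30
max_len = 10**4                     # black 21.x kept the spaces: 10 ** 4


def scale(value: float, exponent: int) -> float:
    # With a non-simple operand (here a parenthesized expression) the spaces
    # around ** should be left unchanged by both versions.
    return value ** (exponent + 1)


print(peak_gib, max_len, scale(2.0, 3))
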
another-pjohnson committed Mar 30, 2022
1 parent 1bc96fa commit 72f373c
Showing 25 changed files with 43 additions and 51 deletions.
7 changes: 1 addition & 6 deletions .pre-commit-config.yaml
@@ -23,7 +23,7 @@ repos:
- id: end-of-file-fixer

- repo: https://github.com/ambv/black
rev: 21.10b0
rev: 22.3.0
hooks:
- id: black

@@ -33,11 +33,6 @@ repos:
- id: flake8
args: [--show-source, --statistics]

- repo: https://github.com/asottile/seed-isort-config
rev: v2.2.0
hooks:
- id: seed-isort-config

- repo: https://github.com/pycqa/isort
rev: 5.10.1
hooks:
2 changes: 1 addition & 1 deletion benchmarks/datasets/wikitext2_data.py
@@ -4,11 +4,11 @@
# LICENSE file in the root directory of this source tree.

from collections import namedtuple
from distutils.version import LooseVersion
import io
import operator
import tempfile

from distutils.version import LooseVersion
import torch
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
4 changes: 2 additions & 2 deletions benchmarks/experimental/offload.py
@@ -135,11 +135,11 @@ def train_epoch(args, num_iters):
loss.backward()
optimizer.step()
logging.info(
"Memory stats are {:.2f}GB".format(torch.cuda.memory_stats(0)["allocated_bytes.all.peak"] / 2 ** 30)
"Memory stats are {:.2f}GB".format(torch.cuda.memory_stats(0)["allocated_bytes.all.peak"] / 2**30)
)
logging.info(
"Loss {:.2f} - throughput {:.2f}fps".format(
loss.item(), benchmark_config["batch_size"] / (time.time_ns() - start) * 10 ** 9
loss.item(), benchmark_config["batch_size"] / (time.time_ns() - start) * 10**9
)
)
num_iters -= 1
2 changes: 1 addition & 1 deletion benchmarks/fsdp.py
@@ -267,7 +267,7 @@ def benchmark_language_model(model_config, model, benchmark_config, model_specs,
print("Throughput(wps) is {:.2f}.".format(wps))
print(
"Peak allocated bytes on cuda:{}: {:4f}GB".format(
dist.get_rank(), torch.cuda.memory_stats(dist.get_rank())["allocated_bytes.all.peak"] / 2 ** 30
dist.get_rank(), torch.cuda.memory_stats(dist.get_rank())["allocated_bytes.all.peak"] / 2**30
)
)

2 changes: 1 addition & 1 deletion benchmarks/golden_configs/lm_wikitext2.py
@@ -97,7 +97,7 @@ def get_golden_synthetic_stats():
return {
"avg_wps": 486.303,
"std_dev_wps": 71.307,
"peak_mem_usage": [5.5055 * 2 ** 30, 5.5055 * 2 ** 30, 5.5055 * 2 ** 30, 5.5055 * 2 ** 30],
"peak_mem_usage": [5.5055 * 2**30, 5.5055 * 2**30, 5.5055 * 2**30, 5.5055 * 2**30],
}


2 changes: 1 addition & 1 deletion benchmarks/oss.py
@@ -89,7 +89,7 @@ def validate_benchmark(measurements, final_loss, args, check_regression):
if not args.cpu:
# TODO(anj-s): Check if we need to synchronize before we caculate total training time.
torch.cuda.synchronize(rank)
max_memory = torch.cuda.max_memory_allocated(rank) / 2 ** 20
max_memory = torch.cuda.max_memory_allocated(rank) / 2**20
logging.info(f"[{rank}] : Peak memory {max_memory:.1f}MiB")

measurements.sort()
@@ -149,8 +149,8 @@ class DynamicDirectedExponentialGraph(GraphManager):
def _make_graph(self) -> None:
for rank in range(self.world_size):
for i in range(0, int(mlog(self.world_size - 1, 2)) + 1):
f_peer = self._rotate_forward(rank, 2 ** i)
b_peer = self._rotate_backward(rank, 2 ** i)
f_peer = self._rotate_forward(rank, 2**i)
b_peer = self._rotate_backward(rank, 2**i)
self._add_peers(rank, [f_peer, b_peer])

def is_regular_graph(self) -> bool:
@@ -196,8 +196,8 @@ def _make_graph(self) -> None:
f_peer = self._rotate_forward(rank, 1)
b_peer = self._rotate_backward(rank, 1)
else:
f_peer = self._rotate_forward(rank, 1 + 2 ** i)
b_peer = self._rotate_backward(rank, 1 + 2 ** i)
f_peer = self._rotate_forward(rank, 1 + 2**i)
b_peer = self._rotate_backward(rank, 1 + 2**i)
# create directory for non-passive peers
if not self.is_passive(rank) and (self.is_passive(f_peer) and self.is_passive(b_peer)):
self._add_peers(rank, [f_peer, b_peer])
@@ -14,7 +14,7 @@

import torch

MAX_LEN_DEQUEUE = 10 ** 4
MAX_LEN_DEQUEUE = 10**4
deque_with_max_len_fixed = partial(deque, maxlen=MAX_LEN_DEQUEUE)


2 changes: 1 addition & 1 deletion fairscale/experimental/optim/dynamic_loss_scaler.py
@@ -36,7 +36,7 @@ class DynamicLossScaler(object):

def __init__(
self,
init_scale: float = 2.0 ** 15,
init_scale: float = 2.0**15,
scale_factor: float = 2.0,
scale_window: int = 2000,
tolerance: float = 0.0,
4 changes: 2 additions & 2 deletions fairscale/nn/data_parallel/fully_sharded_data_parallel.py
@@ -700,7 +700,7 @@ def clip_grad_norm_(
total_norm = local_norm
dist.all_reduce(total_norm, op=torch.distributed.ReduceOp.MAX, group=self.process_group)
else:
total_norm = local_norm ** norm_type
total_norm = local_norm**norm_type
dist.all_reduce(total_norm, group=self.process_group)
total_norm = total_norm ** (1.0 / norm_type)

@@ -2408,7 +2408,7 @@ def _print_r0(self, msg: str, restart: bool = False) -> None:
if restart:
self._tstart = time.time()
if self.rank == 0:
gb_denom = 1024 ** 3
gb_denom = 1024**3
logging.info(
f"{msg} cur={torch.cuda.memory_allocated()/gb_denom: .4f} GB, max={torch.cuda.max_memory_allocated()/gb_denom: .4f} GB, t={time.time()-self._tstart: .1f}"
)
4 changes: 2 additions & 2 deletions fairscale/nn/data_parallel/sharded_ddp.py
@@ -100,7 +100,7 @@ def __init__(
process_group: Any = None,
broadcast_buffers: bool = True,
sync_models_at_startup: bool = True,
reduce_buffer_size: int = 2 ** 23,
reduce_buffer_size: int = 2**23,
auto_refresh_trainable: bool = True,
reduce_fp16: bool = False,
warn_on_trainable_params_changed: bool = True,
@@ -178,7 +178,7 @@ def __init__(

logging.info(
"ShardedDDP bucket size: {:.2f}M parameters, model size {:.2f}M parameters".format(
self._buffer_max_size / 2 ** 20, model_size / 2 ** 20
self._buffer_max_size / 2**20, model_size / 2**20
)
)
self._use_buckets = self._buffer_max_size > 0
4 changes: 2 additions & 2 deletions fairscale/nn/pipe/batchnorm.py
@@ -71,7 +71,7 @@ def _track(self, input: Tensor) -> bool:

with torch.no_grad():
self.sum += input.sum(dim)
self.sum_squares += (input ** 2).sum(dim)
self.sum_squares += (input**2).sum(dim)

size = input.size().numel() // input.size(1)
self.counter += size
@@ -89,7 +89,7 @@ def _commit(self) -> None:
exponential_average_factor = self.momentum

mean = self.sum / self.counter
var = self.sum_squares / self.counter - mean ** 2
var = self.sum_squares / self.counter - mean**2

# Calculate the exponential moving average here.
m = exponential_average_factor
5 changes: 2 additions & 3 deletions fairscale/optim/adam.py
@@ -98,7 +98,7 @@ def __init__(
assert parameters[0].dtype == torch.float16

self.optim_type = torch.float16 if precision is Precision.PURE_FP16 else torch.float32
self._optim_scale = float(2 ** 16) if precision is Precision.PURE_FP16 else 1.0
self._optim_scale = float(2**16) if precision is Precision.PURE_FP16 else 1.0
self._steps_since_optim_scale_change = 0
self._optim_scale_update_freq = 2000 # This is the value that GradScaler uses by default
self._overflow_buf = torch.cuda.IntTensor([0]) # type: ignore
@@ -291,11 +291,10 @@ def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]

if self._steps_since_optim_scale_change == self._optim_scale_update_freq:
self._steps_since_optim_scale_change = 0
if self._optim_scale < 2 ** 16:
if self._optim_scale < 2**16:
self._optim_scale *= 2

return loss


except ImportError:
pass
2 changes: 1 addition & 1 deletion fairscale/optim/adascale.py
@@ -453,7 +453,7 @@ def _final_callback(self) -> None:
# accumulation.
if self._num_grads_to_accum > 1:
# np array doesn't support /=.
total_grad_sqr = total_grad_sqr / (self._num_grads_to_accum ** 2)
total_grad_sqr = total_grad_sqr / (self._num_grads_to_accum**2)

# Wait for all_reduce to be done and move it to cpu & np.
if work:
2 changes: 1 addition & 1 deletion fairscale/optim/grad_scaler.py
@@ -76,7 +76,7 @@ class ShardedGradScaler(TorchGradScaler):

def __init__(
self,
init_scale: float = 2.0 ** 16,
init_scale: float = 2.0**16,
growth_factor: float = 2.0,
backoff_factor: float = 0.5,
growth_interval: int = 2000,
2 changes: 1 addition & 1 deletion fairscale/optim/oss.py
@@ -289,7 +289,7 @@ def clip_grad_norm(
# n_i = sum_rank(a^p)^1/p
# -> n_total = all_reduce(n_i^p)^(1/p) = sum_i(n_i^p)^1/p = sum_i(sum_rank(a^p))^1/p
# all reduce over data parallel and model parallel workers
total_norm = local_norm ** norm_type
total_norm = local_norm**norm_type
dist.all_reduce(total_norm)
total_norm = total_norm ** (1.0 / norm_type)

1 change: 0 additions & 1 deletion pyproject.toml
@@ -27,4 +27,3 @@ use_parentheses = true
skip_glob = ["build/*", "stubs/*"]
# Don't split "import" and "from".
force_sort_within_sections = true
known_third_party = ["benchmark_dataset", "datasets", "distutils", "golden_configs", "models", "numpy", "parameterized", "pytest", "recommonmark", "setuptools", "sklearn", "torch", "torchtext", "torchvision", "utils"]
3 changes: 1 addition & 2 deletions requirements-dev.txt
@@ -6,11 +6,10 @@
# function typing with mypy.
# - if you change versions below, please make sure it is in-sync with
# .pre-commit-config.yaml for pre-commit.
black == 21.10b0
black == 22.3.0
flake8 == 4.0.1
flake8-annotations == 2.7.0
isort == 5.10.1
seed-isort-config == 2.2.0
mypy == 0.910
pre-commit >= 2.15.0

4 changes: 2 additions & 2 deletions tests/nn/data_parallel/test_fsdp_optimizer_utils.py
@@ -162,13 +162,13 @@ def _test_consolidated_optimizer(
assert len(no_broadcast_children) == 1, f"Length of non shared params {len(no_broadcast_children)}"
assert fsdp._fsdp_instances[-1].no_broadcast_optim_state
torch.cuda.empty_cache()
cuda_gb_before = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024 ** 3
cuda_gb_before = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024**3
tstart = time()
sd = fsdp.gather_full_optim_state_dict(fsdp_optim, recipient_rank=0)
duration = time() - tstart
assert duration < fsdp.world_size, f"gather optim state took {duration} seconds, suspect change in _consolidate"

cuda_gb_after = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024 ** 3
cuda_gb_after = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024**3
mem_usg_gb = cuda_gb_after - cuda_gb_before
assert mem_usg_gb == 0, f"gather_full_optim_state_dict used {mem_usg_gb:.2f} CUDA GB, max allowed is 0"
assert cuda_gb_after > 0, "got 0 memory usage, logging is broken"
12 changes: 6 additions & 6 deletions tests/nn/data_parallel/test_sharded_ddp_features.py
@@ -146,7 +146,7 @@ def run_test(backend, device, world_size, broadcast_buffers, grad_accumulation,
@skip_if_single_gpu
@pytest.mark.parametrize("broadcast_buffers", [True, False])
@pytest.mark.parametrize("grad_accumulation", [True, False])
@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
@pytest.mark.parametrize("optimizer_type", [torch.optim.SGD, SGDWithPausingCompute])
@pytest.mark.parametrize("reduce_fp16", [False, True])
@pytest.mark.parametrize(
@@ -204,7 +204,7 @@ def closure():
dist.destroy_process_group()


@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
@pytest.mark.parametrize("backend", ["gloo", "nccl"])
@pytest.mark.parametrize("device", available_devices)
@skip_if_single_gpu
@@ -354,7 +354,7 @@ def run_test_device_change(rank, world_size, backend, device, temp_file_name, re

@skip_if_no_cuda
@skip_if_single_gpu
@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
def test_device_change(reduce_buffer_size):
# Check that ShardedDDP handles a device change properly
world_size = 2
@@ -392,7 +392,7 @@ def run_test_training_change(rank, world_size, backend, device, temp_file_name,

@skip_if_no_cuda
@skip_if_single_gpu
@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
def test_training_change(reduce_buffer_size):
world_size = 2
backend = "nccl"
@@ -528,7 +528,7 @@ def closure():
@skip_if_no_cuda
@skip_if_single_gpu
@pytest.mark.parametrize("world_size", [1, 2])
@pytest.mark.parametrize("reduce_buffer", [2 ** 23, 2 ** 40])
@pytest.mark.parametrize("reduce_buffer", [2**23, 2**40])
def test_gpt2(world_size, reduce_buffer):
# Check that having trainable unused params is fine
backend = "gloo"
@@ -598,7 +598,7 @@ def closure():


@skip_if_less_than_four_gpu
@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
@pytest.mark.parametrize("backend", ["gloo", "nccl"])
def test_multiple_groups(reduce_buffer_size, backend):
world_size = 4
4 changes: 2 additions & 2 deletions tests/nn/data_parallel/test_sharded_ddp_pytorch_parity.py
@@ -233,7 +233,7 @@ def sharded_closure(input_tensor=input_tensor):

@skip_if_no_cuda
@skip_if_single_gpu
@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
@pytest.mark.parametrize("grad_accumulation", [True, False])
@pytest.mark.parametrize("change_train_graph", [True, False])
@pytest.mark.parametrize("fp16_reduction", _test_fp16_reduction)
@@ -347,7 +347,7 @@ def run_ddp_parity_two_optim(rank, world_size, backend, temp_file_name, reduce_b

@skip_if_no_cuda
@skip_if_single_gpu
@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
def test_ddp_parity_two_optim(reduce_buffer_size):
world_size = 2
backend = dist.Backend.NCCL
4 changes: 2 additions & 2 deletions tests/nn/pipe/test_deferred_batch_norm.py
@@ -38,7 +38,7 @@ def tilt_dist(input):

# Tilt mean by single batch.
for i, single in enumerate(input):
single += 2 ** i
single += 2**i

return input

@@ -150,7 +150,7 @@ def test_optimize():
dbn.eval()

with torch.no_grad():
assert torch.allclose(bn(input), dbn(input), atol=1e-1 * (10 ** i))
assert torch.allclose(bn(input), dbn(input), atol=1e-1 * (10**i))


def test_conv_bn():
4 changes: 2 additions & 2 deletions tests/optim/test_adam.py
@@ -311,14 +311,14 @@ def test_update_optim_scale():
weight, bias, input = make_half_precision_params()
optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.PURE_FP16)
optimizer._optim_scale_update_freq = 1
optimizer._optim_scale = 2 ** 15
optimizer._optim_scale = 2**15

optimizer.zero_grad()
loss = (weight.mv(input) + bias).pow(2).sum()
loss.backward()
optimizer.step()

assert optimizer._optim_scale == 2 ** 16
assert optimizer._optim_scale == 2**16


@skip_if_no_cuda
4 changes: 2 additions & 2 deletions tests/optim/test_oss.py
@@ -602,7 +602,7 @@ def closure():

# With SGD, Momentum is required to get a state to shard
optimizer = optim.OSS(
model.parameters(), lr=0.1, momentum=0.99, group=process_group, broadcast_buffer_size=2 ** 20
model.parameters(), lr=0.1, momentum=0.99, group=process_group, broadcast_buffer_size=2**20
)
check(optimizer)

@@ -875,7 +875,7 @@ def check_optimizer_equivalence(optimizer: Type[torch.optim.Optimizer], change_t
params=oss_trainable_params,
optim=optimizer,
group=None,
broadcast_buffer_size=2 ** 10,
broadcast_buffer_size=2**10,
**optimizer_settings,
)

4 changes: 2 additions & 2 deletions tests/optim/test_single_node_adascale.py
@@ -175,7 +175,7 @@ def test_lr_scheduler():
model = Linear(2, 2, bias=False)
optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=3)
# We use 1, not 0.1 here since scheduler.step() is called here first.
scheduler = LambdaLR(optim, lr_lambda=lambda epoch: 1 / 10 ** epoch)
scheduler = LambdaLR(optim, lr_lambda=lambda epoch: 1 / 10**epoch)
for epoch in range(3):
for data_idx in range(10):
for accumulation in range(3):
@@ -186,7 +186,7 @@ def test_lr_scheduler():
optim.step()
optim.zero_grad()
# asserting LR is right
assert np.allclose(optim.param_groups[0]["lr"], 0.1 / 10 ** epoch), optim.param_groups[0]["lr"]
assert np.allclose(optim.param_groups[0]["lr"], 0.1 / 10**epoch), optim.param_groups[0]["lr"]
scheduler.step()
# asserting LR is right
assert np.allclose(optim.param_groups[0]["lr"], 0.1 / 10 ** (epoch + 1)), optim.param_groups[0]["lr"]
