Remove seed-isort-config and related dependencies. (#969)
This is no longer needed now that isort is at version 5.10: isort 5 classifies
third-party imports on its own, so the seed-isort-config hook and the
known_third_party list it maintained can be dropped.

Also pin black to 22.3.0 to fix an issue with the click dependency.

Update the files that now fail with the new version of black: a = 2 ** 4 ->
a = 2**4
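
As a quick illustration of the formatting change behind most of this diff, the sketch below (not part of the commit; the file and variable names are hypothetical) shows how black 22.x writes the power operator compared with the 21.x output noted in the trailing comments: spaces around ** are dropped when both operands are simple (names, numeric literals, or attribute access) and otherwise left in place.

# power_operator_demo.py -- hypothetical example, already formatted with black 22.3.0
allocated_bytes = 12_884_901_888

peak_gib = allocated_bytes / 2**30  # black 21.x kept the spaces: allocated_bytes / 2 ** 30
max_len = 10**4                     # black 21.x kept the spaces: 10 ** 4


def scale(value: float, exponent: int) -> float:
    # With a non-simple operand (here a parenthesized expression) the spaces
    # around ** should be left unchanged by both versions.
    return value ** (exponent + 1)


print(peak_gib, max_len, scale(2.0, 3))
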
another-pjohnson committed Mar 30, 2022
1 parent 1bc96fa commit 72f373c
Showing 25 changed files with 43 additions and 51 deletions.
7 changes: 1 addition & 6 deletions .pre-commit-config.yaml
@@ -23,7 +23,7 @@ repos:
- id: end-of-file-fixer

- repo: https://github.com/ambv/black
rev: 21.10b0
rev: 22.3.0
hooks:
- id: black

@@ -33,11 +33,6 @@ repos:
- id: flake8
args: [--show-source, --statistics]

- repo: https://github.com/asottile/seed-isort-config
rev: v2.2.0
hooks:
- id: seed-isort-config

- repo: https://github.com/pycqa/isort
rev: 5.10.1
hooks:
2 changes: 1 addition & 1 deletion benchmarks/datasets/wikitext2_data.py
@@ -4,11 +4,11 @@
# LICENSE file in the root directory of this source tree.

from collections import namedtuple
from distutils.version import LooseVersion
import io
import operator
import tempfile

from distutils.version import LooseVersion
import torch
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
4 changes: 2 additions & 2 deletions benchmarks/experimental/offload.py
@@ -135,11 +135,11 @@ def train_epoch(args, num_iters):
loss.backward()
optimizer.step()
logging.info(
"Memory stats are {:.2f}GB".format(torch.cuda.memory_stats(0)["allocated_bytes.all.peak"] / 2 ** 30)
"Memory stats are {:.2f}GB".format(torch.cuda.memory_stats(0)["allocated_bytes.all.peak"] / 2**30)
)
logging.info(
"Loss {:.2f} - throughput {:.2f}fps".format(
loss.item(), benchmark_config["batch_size"] / (time.time_ns() - start) * 10 ** 9
loss.item(), benchmark_config["batch_size"] / (time.time_ns() - start) * 10**9
)
)
num_iters -= 1
2 changes: 1 addition & 1 deletion benchmarks/fsdp.py
@@ -267,7 +267,7 @@ def benchmark_language_model(model_config, model, benchmark_config, model_specs,
print("Throughput(wps) is {:.2f}.".format(wps))
print(
"Peak allocated bytes on cuda:{}: {:4f}GB".format(
dist.get_rank(), torch.cuda.memory_stats(dist.get_rank())["allocated_bytes.all.peak"] / 2 ** 30
dist.get_rank(), torch.cuda.memory_stats(dist.get_rank())["allocated_bytes.all.peak"] / 2**30
)
)

2 changes: 1 addition & 1 deletion benchmarks/golden_configs/lm_wikitext2.py
@@ -97,7 +97,7 @@ def get_golden_synthetic_stats():
return {
"avg_wps": 486.303,
"std_dev_wps": 71.307,
"peak_mem_usage": [5.5055 * 2 ** 30, 5.5055 * 2 ** 30, 5.5055 * 2 ** 30, 5.5055 * 2 ** 30],
"peak_mem_usage": [5.5055 * 2**30, 5.5055 * 2**30, 5.5055 * 2**30, 5.5055 * 2**30],
}


2 changes: 1 addition & 1 deletion benchmarks/oss.py
@@ -89,7 +89,7 @@ def validate_benchmark(measurements, final_loss, args, check_regression):
if not args.cpu:
# TODO(anj-s): Check if we need to synchronize before we caculate total training time.
torch.cuda.synchronize(rank)
max_memory = torch.cuda.max_memory_allocated(rank) / 2 ** 20
max_memory = torch.cuda.max_memory_allocated(rank) / 2**20
logging.info(f"[{rank}] : Peak memory {max_memory:.1f}MiB")

measurements.sort()
@@ -149,8 +149,8 @@ class DynamicDirectedExponentialGraph(GraphManager):
def _make_graph(self) -> None:
for rank in range(self.world_size):
for i in range(0, int(mlog(self.world_size - 1, 2)) + 1):
f_peer = self._rotate_forward(rank, 2 ** i)
b_peer = self._rotate_backward(rank, 2 ** i)
f_peer = self._rotate_forward(rank, 2**i)
b_peer = self._rotate_backward(rank, 2**i)
self._add_peers(rank, [f_peer, b_peer])

def is_regular_graph(self) -> bool:
@@ -196,8 +196,8 @@ def _make_graph(self) -> None:
f_peer = self._rotate_forward(rank, 1)
b_peer = self._rotate_backward(rank, 1)
else:
f_peer = self._rotate_forward(rank, 1 + 2 ** i)
b_peer = self._rotate_backward(rank, 1 + 2 ** i)
f_peer = self._rotate_forward(rank, 1 + 2**i)
b_peer = self._rotate_backward(rank, 1 + 2**i)
# create directory for non-passive peers
if not self.is_passive(rank) and (self.is_passive(f_peer) and self.is_passive(b_peer)):
self._add_peers(rank, [f_peer, b_peer])
@@ -14,7 +14,7 @@

import torch

MAX_LEN_DEQUEUE = 10 ** 4
MAX_LEN_DEQUEUE = 10**4
deque_with_max_len_fixed = partial(deque, maxlen=MAX_LEN_DEQUEUE)


2 changes: 1 addition & 1 deletion fairscale/experimental/optim/dynamic_loss_scaler.py
@@ -36,7 +36,7 @@ class DynamicLossScaler(object):

def __init__(
self,
init_scale: float = 2.0 ** 15,
init_scale: float = 2.0**15,
scale_factor: float = 2.0,
scale_window: int = 2000,
tolerance: float = 0.0,
4 changes: 2 additions & 2 deletions fairscale/nn/data_parallel/fully_sharded_data_parallel.py
@@ -700,7 +700,7 @@ def clip_grad_norm_(
total_norm = local_norm
dist.all_reduce(total_norm, op=torch.distributed.ReduceOp.MAX, group=self.process_group)
else:
total_norm = local_norm ** norm_type
total_norm = local_norm**norm_type
dist.all_reduce(total_norm, group=self.process_group)
total_norm = total_norm ** (1.0 / norm_type)

@@ -2408,7 +2408,7 @@ def _print_r0(self, msg: str, restart: bool = False) -> None:
if restart:
self._tstart = time.time()
if self.rank == 0:
gb_denom = 1024 ** 3
gb_denom = 1024**3
logging.info(
f"{msg} cur={torch.cuda.memory_allocated()/gb_denom: .4f} GB, max={torch.cuda.max_memory_allocated()/gb_denom: .4f} GB, t={time.time()-self._tstart: .1f}"
)
4 changes: 2 additions & 2 deletions fairscale/nn/data_parallel/sharded_ddp.py
@@ -100,7 +100,7 @@ def __init__(
process_group: Any = None,
broadcast_buffers: bool = True,
sync_models_at_startup: bool = True,
reduce_buffer_size: int = 2 ** 23,
reduce_buffer_size: int = 2**23,
auto_refresh_trainable: bool = True,
reduce_fp16: bool = False,
warn_on_trainable_params_changed: bool = True,
@@ -178,7 +178,7 @@ def __init__(

logging.info(
"ShardedDDP bucket size: {:.2f}M parameters, model size {:.2f}M parameters".format(
self._buffer_max_size / 2 ** 20, model_size / 2 ** 20
self._buffer_max_size / 2**20, model_size / 2**20
)
)
self._use_buckets = self._buffer_max_size > 0
4 changes: 2 additions & 2 deletions fairscale/nn/pipe/batchnorm.py
@@ -71,7 +71,7 @@ def _track(self, input: Tensor) -> bool:

with torch.no_grad():
self.sum += input.sum(dim)
self.sum_squares += (input ** 2).sum(dim)
self.sum_squares += (input**2).sum(dim)

size = input.size().numel() // input.size(1)
self.counter += size
@@ -89,7 +89,7 @@ def _commit(self) -> None:
exponential_average_factor = self.momentum

mean = self.sum / self.counter
var = self.sum_squares / self.counter - mean ** 2
var = self.sum_squares / self.counter - mean**2

# Calculate the exponential moving average here.
m = exponential_average_factor
5 changes: 2 additions & 3 deletions fairscale/optim/adam.py
@@ -98,7 +98,7 @@ def __init__(
assert parameters[0].dtype == torch.float16

self.optim_type = torch.float16 if precision is Precision.PURE_FP16 else torch.float32
self._optim_scale = float(2 ** 16) if precision is Precision.PURE_FP16 else 1.0
self._optim_scale = float(2**16) if precision is Precision.PURE_FP16 else 1.0
self._steps_since_optim_scale_change = 0
self._optim_scale_update_freq = 2000 # This is the value that GradScaler uses by default
self._overflow_buf = torch.cuda.IntTensor([0]) # type: ignore
@@ -291,11 +291,10 @@ def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]

if self._steps_since_optim_scale_change == self._optim_scale_update_freq:
self._steps_since_optim_scale_change = 0
if self._optim_scale < 2 ** 16:
if self._optim_scale < 2**16:
self._optim_scale *= 2

return loss


except ImportError:
pass
2 changes: 1 addition & 1 deletion fairscale/optim/adascale.py
@@ -453,7 +453,7 @@ def _final_callback(self) -> None:
# accumulation.
if self._num_grads_to_accum > 1:
# np array doesn't support /=.
total_grad_sqr = total_grad_sqr / (self._num_grads_to_accum ** 2)
total_grad_sqr = total_grad_sqr / (self._num_grads_to_accum**2)

# Wait for all_reduce to be done and move it to cpu & np.
if work:
2 changes: 1 addition & 1 deletion fairscale/optim/grad_scaler.py
@@ -76,7 +76,7 @@ class ShardedGradScaler(TorchGradScaler):

def __init__(
self,
init_scale: float = 2.0 ** 16,
init_scale: float = 2.0**16,
growth_factor: float = 2.0,
backoff_factor: float = 0.5,
growth_interval: int = 2000,
2 changes: 1 addition & 1 deletion fairscale/optim/oss.py
@@ -289,7 +289,7 @@ def clip_grad_norm(
# n_i = sum_rank(a^p)^1/p
# -> n_total = all_reduce(n_i^p)^(1/p) = sum_i(n_i^p)^1/p = sum_i(sum_rank(a^p))^1/p
# all reduce over data parallel and model parallel workers
total_norm = local_norm ** norm_type
total_norm = local_norm**norm_type
dist.all_reduce(total_norm)
total_norm = total_norm ** (1.0 / norm_type)

1 change: 0 additions & 1 deletion pyproject.toml
@@ -27,4 +27,3 @@ use_parentheses = true
skip_glob = ["build/*", "stubs/*"]
# Don't split "import" and "from".
force_sort_within_sections = true
known_third_party = ["benchmark_dataset", "datasets", "distutils", "golden_configs", "models", "numpy", "parameterized", "pytest", "recommonmark", "setuptools", "sklearn", "torch", "torchtext", "torchvision", "utils"]
3 changes: 1 addition & 2 deletions requirements-dev.txt
@@ -6,11 +6,10 @@
# function typing with mypy.
# - if you change versions below, please make sure it is in-sync with
# .pre-commit-config.yaml for pre-commit.
black == 21.10b0
black == 22.3.0
flake8 == 4.0.1
flake8-annotations == 2.7.0
isort == 5.10.1
seed-isort-config == 2.2.0
mypy == 0.910
pre-commit >= 2.15.0

4 changes: 2 additions & 2 deletions tests/nn/data_parallel/test_fsdp_optimizer_utils.py
@@ -162,13 +162,13 @@ def _test_consolidated_optimizer(
assert len(no_broadcast_children) == 1, f"Length of non shared params {len(no_broadcast_children)}"
assert fsdp._fsdp_instances[-1].no_broadcast_optim_state
torch.cuda.empty_cache()
cuda_gb_before = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024 ** 3
cuda_gb_before = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024**3
tstart = time()
sd = fsdp.gather_full_optim_state_dict(fsdp_optim, recipient_rank=0)
duration = time() - tstart
assert duration < fsdp.world_size, f"gather optim state took {duration} seconds, suspect change in _consolidate"

cuda_gb_after = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024 ** 3
cuda_gb_after = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024**3
mem_usg_gb = cuda_gb_after - cuda_gb_before
assert mem_usg_gb == 0, f"gather_full_optim_state_dict used {mem_usg_gb:.2f} CUDA GB, max allowed is 0"
assert cuda_gb_after > 0, "got 0 memory usage, logging is broken"
12 changes: 6 additions & 6 deletions tests/nn/data_parallel/test_sharded_ddp_features.py
@@ -146,7 +146,7 @@ def run_test(backend, device, world_size, broadcast_buffers, grad_accumulation,
@skip_if_single_gpu
@pytest.mark.parametrize("broadcast_buffers", [True, False])
@pytest.mark.parametrize("grad_accumulation", [True, False])
@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
@pytest.mark.parametrize("optimizer_type", [torch.optim.SGD, SGDWithPausingCompute])
@pytest.mark.parametrize("reduce_fp16", [False, True])
@pytest.mark.parametrize(
@@ -204,7 +204,7 @@ def closure():
dist.destroy_process_group()


@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
@pytest.mark.parametrize("backend", ["gloo", "nccl"])
@pytest.mark.parametrize("device", available_devices)
@skip_if_single_gpu
@@ -354,7 +354,7 @@ def run_test_device_change(rank, world_size, backend, device, temp_file_name, re

@skip_if_no_cuda
@skip_if_single_gpu
@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
def test_device_change(reduce_buffer_size):
# Check that ShardedDDP handles a device change properly
world_size = 2
@@ -392,7 +392,7 @@ def run_test_training_change(rank, world_size, backend, device, temp_file_name,

@skip_if_no_cuda
@skip_if_single_gpu
@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
def test_training_change(reduce_buffer_size):
world_size = 2
backend = "nccl"
@@ -528,7 +528,7 @@ def closure():
@skip_if_no_cuda
@skip_if_single_gpu
@pytest.mark.parametrize("world_size", [1, 2])
@pytest.mark.parametrize("reduce_buffer", [2 ** 23, 2 ** 40])
@pytest.mark.parametrize("reduce_buffer", [2**23, 2**40])
def test_gpt2(world_size, reduce_buffer):
# Check that having trainable unused params is fine
backend = "gloo"
@@ -598,7 +598,7 @@ def closure():


@skip_if_less_than_four_gpu
@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
@pytest.mark.parametrize("backend", ["gloo", "nccl"])
def test_multiple_groups(reduce_buffer_size, backend):
world_size = 4
4 changes: 2 additions & 2 deletions tests/nn/data_parallel/test_sharded_ddp_pytorch_parity.py
@@ -233,7 +233,7 @@ def sharded_closure(input_tensor=input_tensor):

@skip_if_no_cuda
@skip_if_single_gpu
@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
@pytest.mark.parametrize("grad_accumulation", [True, False])
@pytest.mark.parametrize("change_train_graph", [True, False])
@pytest.mark.parametrize("fp16_reduction", _test_fp16_reduction)
@@ -347,7 +347,7 @@ def run_ddp_parity_two_optim(rank, world_size, backend, temp_file_name, reduce_b

@skip_if_no_cuda
@skip_if_single_gpu
@pytest.mark.parametrize("reduce_buffer_size", [0, 2 ** 20])
@pytest.mark.parametrize("reduce_buffer_size", [0, 2**20])
def test_ddp_parity_two_optim(reduce_buffer_size):
world_size = 2
backend = dist.Backend.NCCL
4 changes: 2 additions & 2 deletions tests/nn/pipe/test_deferred_batch_norm.py
@@ -38,7 +38,7 @@ def tilt_dist(input):

# Tilt mean by single batch.
for i, single in enumerate(input):
single += 2 ** i
single += 2**i

return input

@@ -150,7 +150,7 @@ def test_optimize():
dbn.eval()

with torch.no_grad():
assert torch.allclose(bn(input), dbn(input), atol=1e-1 * (10 ** i))
assert torch.allclose(bn(input), dbn(input), atol=1e-1 * (10**i))


def test_conv_bn():
4 changes: 2 additions & 2 deletions tests/optim/test_adam.py
@@ -311,14 +311,14 @@ def test_update_optim_scale():
weight, bias, input = make_half_precision_params()
optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.PURE_FP16)
optimizer._optim_scale_update_freq = 1
optimizer._optim_scale = 2 ** 15
optimizer._optim_scale = 2**15

optimizer.zero_grad()
loss = (weight.mv(input) + bias).pow(2).sum()
loss.backward()
optimizer.step()

assert optimizer._optim_scale == 2 ** 16
assert optimizer._optim_scale == 2**16


@skip_if_no_cuda
4 changes: 2 additions & 2 deletions tests/optim/test_oss.py
@@ -602,7 +602,7 @@ def closure():

# With SGD, Momentum is required to get a state to shard
optimizer = optim.OSS(
model.parameters(), lr=0.1, momentum=0.99, group=process_group, broadcast_buffer_size=2 ** 20
model.parameters(), lr=0.1, momentum=0.99, group=process_group, broadcast_buffer_size=2**20
)
check(optimizer)

@@ -875,7 +875,7 @@ def check_optimizer_equivalence(optimizer: Type[torch.optim.Optimizer], change_t
params=oss_trainable_params,
optim=optimizer,
group=None,
broadcast_buffer_size=2 ** 10,
broadcast_buffer_size=2**10,
**optimizer_settings,
)

4 changes: 2 additions & 2 deletions tests/optim/test_single_node_adascale.py
@@ -175,7 +175,7 @@ def test_lr_scheduler():
model = Linear(2, 2, bias=False)
optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=3)
# We use 1, not 0.1 here since scheduler.step() is called here first.
scheduler = LambdaLR(optim, lr_lambda=lambda epoch: 1 / 10 ** epoch)
scheduler = LambdaLR(optim, lr_lambda=lambda epoch: 1 / 10**epoch)
for epoch in range(3):
for data_idx in range(10):
for accumulation in range(3):
@@ -186,7 +186,7 @@ def test_lr_scheduler():
optim.step()
optim.zero_grad()
# asserting LR is right
assert np.allclose(optim.param_groups[0]["lr"], 0.1 / 10 ** epoch), optim.param_groups[0]["lr"]
assert np.allclose(optim.param_groups[0]["lr"], 0.1 / 10**epoch), optim.param_groups[0]["lr"]
scheduler.step()
# asserting LR is right
assert np.allclose(optim.param_groups[0]["lr"], 0.1 / 10 ** (epoch + 1)), optim.param_groups[0]["lr"]
