fix typo in comments under torch/csrc/distributed (#96062)

This PR fixes typos in comments and messages of `.cpp` and `.hpp` files under the `torch/csrc/distributed` directory. Pull Request resolved: pytorch/pytorch#96062 Approved by: https://github.com/ngimel
cyyever · Mar 12, 2023 · 1addf8d · 1addf8d
1 parent b074b4c
commit 1addf8d
Show file tree

Hide file tree

Showing 22 changed files with 32 additions and 32 deletions.
diff --git a/torch/csrc/distributed/autograd/engine/dist_engine.cpp b/torch/csrc/distributed/autograd/engine/dist_engine.cpp
@@ -31,7 +31,7 @@ static constexpr const char* kNumAutogradContexts = "num_autograd_contexts";
 
 // This hook does 3 things:
 //   1. Call pre hooks of the original AccumulateGrad to modify the input grad.
-//   2. Accumuate the guard to RPC context.
+//   2. Accumulate the guard to RPC context.
 //   3. Call post hooks of the original AccumulateGrad.
 class DistAccumulateGradCaptureHook
     : public GraphTask::ExecInfo::Capture::GradCaptureHook {

diff --git a/torch/csrc/distributed/autograd/engine/dist_engine.h b/torch/csrc/distributed/autograd/engine/dist_engine.h
@@ -96,7 +96,7 @@ class TORCH_API DistEngine {
   // traverse the GraphTask instead of using the GraphTask embedded
   // cpu_ready_queue, this is because dist engine might run the same GraphTask
   // from different SendFunctions concurrently in different threads. The method
-  // will only mark the GraphTask as completed when it needes to, which means it
+  // will only mark the GraphTask as completed when it needs to, which means it
   // might not mark as completed for every call as dist engine would like to
   // keep the GraphTask alive when it not receives all gradients.
   //

diff --git a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_profiling_resp.h b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_profiling_resp.h
@@ -18,7 +18,7 @@ class TORCH_API RpcWithProfilingResp : public rpc::RpcCommandBase {
       std::vector<torch::autograd::profiler::LegacyEvent> profiledEvents,
       rpc::ProfilingId profilingId);
 
-  // For receving RPCs. Used in from message when converting a message received
+  // For receiving RPCs. Used in from message when converting a message received
   // over the wire.
   RpcWithProfilingResp(
       rpc::MessageType messageType,

diff --git a/torch/csrc/distributed/c10d/Backend.hpp b/torch/csrc/distributed/c10d/Backend.hpp
@@ -113,7 +113,7 @@ class TORCH_API Backend : public torch::CustomClassHolder {
   }
 
   // Gathers a single tensor inputBuffer into a single buffer outputBuffer that
-  // is interpreted as a contigious collection of size inputBuffer * WORLD_SIZE.
+  // is interpreted as a contiguous collection of size inputBuffer * WORLD_SIZE.
   // For implementers of ProcessGroup API and advanced users only.
   // Note: this function will be deprecated in near future.
   virtual c10::intrusive_ptr<Work> _allgather_base(

diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp
@@ -226,7 +226,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
   }
 
   // Gathers a single tensor inputBuffer into a single buffer outputBuffer that
-  // is interpreted as a contigious collection of size inputBuffer * WORLD_SIZE.
+  // is interpreted as a contiguous collection of size inputBuffer * WORLD_SIZE.
   // For implementers of ProcessGroup API and advanced users only.
   // Note: this function will be deprecated in near future.
   virtual c10::intrusive_ptr<Work> _allgather_base(

diff --git a/torch/csrc/distributed/c10d/ProcessGroupMPI.hpp b/torch/csrc/distributed/c10d/ProcessGroupMPI.hpp
@@ -71,8 +71,8 @@ struct WorkEntry {
 // MPI_THREAD_SERIALIZED, ProcessGroupMPI will only support a singe process
 // group. In other words, no more than 1 process group can be created globally.
 //
-// If you would like to use multiple ProcessGroupMPI, it requres your MPI
-// implemenation to have a thread support value of MPI_THREAD_MULTIPLE, that is,
+// If you would like to use multiple ProcessGroupMPI, it requires your MPI
+// implementation to have a thread support value of MPI_THREAD_MULTIPLE, that is,
 // multiple threads may call MPI, with no restriction.
 //
 // Also note that ProcessGroupMPI only supports a single Tensor operation. In
@@ -229,7 +229,7 @@ class TORCH_API ProcessGroupMPI : public Backend {
   c10::intrusive_ptr<Work> barrier(
       const BarrierOptions& opts = BarrierOptions()) override;
 
-  // Creating a new ProcessGroupMPI, will initiialize MPI if not initialized
+  // Creating a new ProcessGroupMPI, will initialize MPI if not initialized
   static c10::intrusive_ptr<ProcessGroupMPI> createProcessGroupMPI(
       std::vector<int> ranks = {});
 

diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -499,7 +499,7 @@ void ProcessGroupNCCL::WorkNCCL::synchronizeInternal(
         // So explicitly abort ncclComms here before throwing this timed out
         // exception to users, after this, ncclCommWatchdog can detect nccl
         // communicators are aborted and clean up devNCCLCommMap_ accordingly.
-        // if throwing timed out excepiton without aborting nccl communicators
+        // if throwing timed out exception without aborting nccl communicators
         // here, it was observed that CUDA GPU will have 100% utilization and
         // can not run new events successfully.
 

diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
@@ -136,7 +136,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
     void synchronizeStreams();
 
     // Helper function used in CUDA Stream callbacks to complete WorkNCCL
-    // objects and throw exceptions when neeeded.
+    // objects and throw exceptions when needed.
     void handleNCCLGuard(ErrorHandlingMode asyncErrorHandling);
 
     // Helper function that checks if the NCCL kernels have finished
@@ -497,7 +497,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
 
   // Helper that encapsulates work shared across point-to-point communication
   // primitives. It is the same structure as the helper used for collective
-  // communicaiton primitives.
+  // communication primitives.
   template <typename Fn>
   c10::intrusive_ptr<Work> pointToPoint(
       std::vector<at::Tensor>& tensor,

diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
@@ -1494,7 +1494,7 @@ that adds a prefix to each key inserted to the store.
           processGroup,
           "Options",
           R"(
-Base class for all processs group options implementations, such as the nccl
+Base class for all process group options implementations, such as the nccl
 options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
 )")
           .def(
@@ -2096,7 +2096,7 @@ Example::
                 ``fut.then()`` will return another ``CUDAFuture`` that holds the return value of the
                 callback and a ``CUDAEvent`` that recorded the callback stream.
 
-                    1. For CPU work, ``fut.done()`` returns true when work has been complted and value()
+                    1. For CPU work, ``fut.done()`` returns true when work has been completed and value()
                        tensors are ready.
                     2. For GPU work, ``fut.done()`` returns true only whether the operation has been enqueued.
                     3. For mixed CPU-GPU work (e.g. sending GPU tensors with GLOO), ``fut.done()`` returns

diff --git a/torch/csrc/distributed/c10d/logger.hpp b/torch/csrc/distributed/c10d/logger.hpp
@@ -69,7 +69,7 @@ class TORCH_API Logger {
   );
   // Set stats that can be collected only during
   // training loop. It is called at the beginning of forward call
-  // to record the run time stats of sampled iterations that previouly ran.
+  // to record the run time stats of sampled iterations that previously ran.
   // GPU performance stats are collected only for single process
   // single device program and single device module right now.
   // TODO to support single process multiple devices and multi device modules,

diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp
@@ -1178,7 +1178,7 @@ void Reducer::initialize_bucket_views(Reducer::Bucket& bucket) {
         if (grad.defined() && !grad.is_alias_of(bucket_view)) {
           bucket_view.copy_(grad);
           grad = bucket_view;
-          // The grad is modefied and needs to be written back.
+          // The grad is modified and needs to be written back.
           return true;
         }
         // The grad is not modified and does not need to be written back.

diff --git a/torch/csrc/distributed/c10d/reducer.hpp b/torch/csrc/distributed/c10d/reducer.hpp
@@ -73,8 +73,8 @@ class TORCH_API Reducer {
   // a call to this function can simply be omitted.
   void prepare_for_backward(const std::vector<at::Tensor>& outputs);
 
-  // Called at the begginning of forward() inside DistributedDataParallel,
-  // right now it caputures the starting time of forward in each iteration.
+  // Called at the beginning of forward() inside DistributedDataParallel,
+  // right now it captures the starting time of forward in each iteration.
   void prepare_for_forward();
 
   // Returns the relative time in nanoseconds when gradients were ready,
@@ -153,7 +153,7 @@ class TORCH_API Reducer {
 
   // An function for users to set sample_rate of collecting
   // runtime stats. The time stats will be recorded for the
-  // first 10 iterations, after 10 iteratons time stats will be
+  // first 10 iterations, after 10 iterations time stats will be
   // recorded once every "sample_rate" training iterations.
   void set_ddp_runtime_logging_sample_rate(int sample_rate);
 
@@ -504,7 +504,7 @@ class TORCH_API Reducer {
   // Retrieves parameter names that have not been marked as ready as part of
   // previous iteration.
   std::vector<std::string> getUnmarkedParamsForIteration();
-  // Retrives parameter indices that have not been marked as ready as part of
+  // Retrieves parameter indices that have not been marked as ready as part of
   // previous iteration.
   std::vector<size_t> getUnmarkedParamIndicesForIteration();
   // Raises appropriate error if mark_variable_ready is called on the same

diff --git a/torch/csrc/distributed/rpc/message.h b/torch/csrc/distributed/rpc/message.h
@@ -98,7 +98,7 @@ enum MessageType {
 //        to determine how to serialize them. This design is helpful for
 //        communicating super large tensors where serializing all the data at
 //        once leads to excessively large memory footprint. An implementation
-//        can then serialize and send tensors chunck-by-chunk, in the streaming
+//        can then serialize and send tensors chunk-by-chunk, in the streaming
 //        fashion.
 //    type (MessageType): type of the message.
 //    id (int64_t): message id, this is used to match request and response.

diff --git a/torch/csrc/distributed/rpc/profiler/server_process_global_profiler.h b/torch/csrc/distributed/rpc/profiler/server_process_global_profiler.h
@@ -76,7 +76,7 @@ TORCH_API extern mutexType currentStateStackEntryMutex;
 
 // This class is used to implement a stack of ``State``s.
 // It has 2 members.
-// One is `prevPtr`, a shared_ptr poiniting to previous elememnt in the
+// One is `prevPtr`, a shared_ptr pointing to previous element in the
 // stack.
 // The other is ``statePtr``, a shared_ptr pointing to ``State``.
 class StateStackEntry {

diff --git a/torch/csrc/distributed/rpc/request_callback.cpp b/torch/csrc/distributed/rpc/request_callback.cpp
@@ -14,7 +14,7 @@ c10::intrusive_ptr<JitFuture> RequestCallback::operator()(
     std::vector<c10::Stream> streams) const {
   // NB: cannot clear autograd context id here because the processMessage method
   // might pause waiting for all RRefs in the arguments to be confirmed by their
-  // owners and resumne processing in a different thread. Hence, the
+  // owners and resume processing in a different thread. Hence, the
   // thread_local context id needs to be set and cleared in the thread that
   // indeed carries out the processing logic.
   return processMessage(request, std::move(streams));

diff --git a/torch/csrc/distributed/rpc/request_callback_impl.cpp b/torch/csrc/distributed/rpc/request_callback_impl.cpp
@@ -125,7 +125,7 @@ c10::intrusive_ptr<JitFuture> RequestCallbackImpl::runPythonFunction(
     return asFuture(std::current_exception());
   }
 
-  // After sync exection or failed async execution return the value as-is.
+  // After sync execution or failed async execution return the value as-is.
   if (pythonRpcHandler.isRemoteException(result) || !isAsyncExecution) {
     return asFuture(
         c10::ivalue::ConcretePyObjectHolder::create(result),

diff --git a/torch/csrc/distributed/rpc/request_callback_no_python.cpp b/torch/csrc/distributed/rpc/request_callback_no_python.cpp
@@ -78,7 +78,7 @@ c10::intrusive_ptr<JitFuture> RequestCallbackNoPython::processMessage(
           // of 10us.
           auto serverProcessGlobalProfilerStateStackEntryPtr =
               profiler::processglobal::StateStackEntry::current();
-          // If server global profiler is enabled, we futher pay the
+          // If server global profiler is enabled, we further pay the
           // cost of thread local profiler state initialization.
           if (serverProcessGlobalProfilerStateStackEntryPtr) {
             // Initialize thread-local profiler state from process-global

diff --git a/torch/csrc/distributed/rpc/rpc_agent.cpp b/torch/csrc/distributed/rpc/rpc_agent.cpp
@@ -178,7 +178,7 @@ void RpcAgent::retryExpiredRpcs() {
     }
 
     // If there are no more RPC's set to be retried at the current timepoint,
-    // we can remove the corresponsing unordered_set from the retry map.
+    // we can remove the corresponding unordered_set from the retry map.
     if (earliestRpcList.empty()) {
       rpcRetryMap_.erase(earliestTimeout);
     }

diff --git a/torch/csrc/distributed/rpc/rpc_agent.h b/torch/csrc/distributed/rpc/rpc_agent.h
@@ -32,7 +32,7 @@ using steady_clock_time_point =
     std::chrono::time_point<std::chrono::steady_clock>;
 // Input is qualified name string, output is JIT StrongTypePtr
 // Same as jit::TypeResolver, did not import jit::TypeResolver to here
-// because it could instroduce cyclic dependencies.
+// because it could introduce cyclic dependencies.
 using TypeResolver =
     std::function<c10::StrongTypePtr(const c10::QualifiedName&)>;
 
@@ -153,7 +153,7 @@ class TORCH_API RpcAgent {
       const DeviceMap& deviceMap = {}) = 0;
 
   // Retries sending the message up to maxRetries times until an ACK is
-  // receieved. The duration between consecutive sends is increased over
+  // received. The duration between consecutive sends is increased over
   // time using an exponential backoff algorithm.
   //
   // Sends ``message`` to the ``RpcAgent`` of id ``to`` and returns a
@@ -232,7 +232,7 @@ class TORCH_API RpcAgent {
   // Retrieve metrics as KV map
   virtual std::unordered_map<std::string, std::string> getMetrics() = 0;
 
-  // Retrive debug info in addition to metrics as KV map
+  // Retrieve debug info in addition to metrics as KV map
   virtual std::unordered_map<std::string, std::string> getDebugInfo();
 
   // Flag to control whether GIL wait times

diff --git a/torch/csrc/distributed/rpc/rref_context.h b/torch/csrc/distributed/rpc/rref_context.h
@@ -180,7 +180,7 @@ class TORCH_API RRefContext {
   // been confirmed (i.e. is no longer in the pendingUsers_ map).
   c10::intrusive_ptr<RRef> getPendingUser(const ForkId& forkId);
 
-  // Start recroding new pending UserRRefs. All pending UserRRefs introduced
+  // Start recording new pending UserRRefs. All pending UserRRefs introduced
   // after this point will be put into the thread_local userTable_, which will
   // then be consumed and cleared in waitForThreadLocalPendingRRefs().
   void recordThreadLocalPendingRRefs();
@@ -264,7 +264,7 @@ class TORCH_API RRefContext {
       RRefId::Hash>
       forks_;
 
-  // This cond var is used by deleteAllUsers(), a event notificaton is sent if
+  // This cond var is used by deleteAllUsers(), an event notification is sent if
   // number of pending UserRRef or UserRRef children is reduced, or
   // number of owned OwnerRRef is reduced.
   std::condition_variable deleteAllUsersCV_;

diff --git a/torch/csrc/distributed/rpc/rref_proto.h b/torch/csrc/distributed/rpc/rref_proto.h
@@ -111,7 +111,7 @@ class TORCH_API PythonRRefFetchRet final : public RRefFetchRet {
       const Message& message);
 };
 
-// UserRRef (regardless it's the creator or not) uses this message to notiify
+// UserRRef (regardless it's the creator or not) uses this message to notify
 // OwnerRRef on delete.
 class TORCH_API RRefUserDelete final : public ForkMessageBase {
  public:

diff --git a/torch/csrc/distributed/rpc/script_remote_call.h b/torch/csrc/distributed/rpc/script_remote_call.h
@@ -15,7 +15,7 @@ using torch::jit::Operator;
 // A ScriptRemoteCall instance represents an invocation of `dist.remote` on a
 // builtin operator. Currently, it does not support using RRef as arguments yet.
 // Besides the operator and a vector of arguments, ScriptRemoteCall also
-// caontains the RRefId and the ForkId of the return value RRef.
+// contains the RRefId and the ForkId of the return value RRef.
 class TORCH_API ScriptRemoteCall final : public ScriptCall {
  public:
   // Constructor for builitin operator call.