clip_grad_global_norm with zero max_grad_norm

dmlc · Jul 4, 2020 · 0e13a58 · 0e13a58
1 parent bd270f2
commit 0e13a58
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 6 deletions.
diff --git a/scripts/pretraining/run_electra.py b/scripts/pretraining/run_electra.py
@@ -448,7 +448,7 @@ def train(args):
         # We need to change the ratio to be
         #  \sum_{n=1}^N g_n / loss_denom  -->  clip to args.max_grad_norm  * N / loss_denom
         total_norm, ratio, is_finite = clip_grad_global_norm(
-            params, args.max_grad_norm * num_samples_per_update / loss_denom)
+            params, args.max_grad_norm, loss_denom / num_samples_per_update)
         total_norm = total_norm / (num_samples_per_update / loss_denom)
         trainer.update(num_samples_per_update / loss_denom, ignore_stale_grad=True)
         step_num += 1

diff --git a/scripts/question_answering/run_squad.py b/scripts/question_answering/run_squad.py
@@ -572,7 +572,7 @@ def train(args):
         # We need to change the ratio to be
         #  \sum_{n=1}^N g_n / loss_denom  -->  clip to args.max_grad_norm  * N / loss_denom
         total_norm, ratio, is_finite = clip_grad_global_norm(
-            params, args.max_grad_norm * num_samples_per_update / loss_denom)
+            params, args.max_grad_norm, loss_denom / num_samples_per_update)
         total_norm = total_norm / (num_samples_per_update / loss_denom)
 
         trainer.update(num_samples_per_update / loss_denom, ignore_stale_grad=True)

diff --git a/src/gluonnlp/utils/parameter.py b/src/gluonnlp/utils/parameter.py
@@ -94,7 +94,8 @@ def grad_global_norm(parameters: Iterable[Parameter]) -> float:
 
 
 def clip_grad_global_norm(parameters: Iterable[Parameter],
-                          max_norm: float,
+                          max_grad_norm: float, 
+                          multiplier: float = 1.0,
                           check_isfinite: bool = True) -> Tuple[float, float, bool]:
     """Rescales gradients of parameters so that the sum of their 2-norm is smaller than `max_norm`.
     If gradients exist for more than one context for a parameter, user needs to explicitly call
@@ -123,8 +124,10 @@ def clip_grad_global_norm(parameters: Iterable[Parameter],
     ----------
     parameters
         The list of parameters to calculate the norm
-    max_norm
+    max_grad_norm
         If the gradient norm is larger than max_grad_norm, it will be clipped to have max_grad_norm
+    multiplier
+        Constant multiplier to scale the gradient
     check_isfinite
          If True, check whether the total_norm is finite (not nan or inf).
 
@@ -140,14 +143,20 @@ def clip_grad_global_norm(parameters: Iterable[Parameter],
     """
     total_norm = grad_global_norm(parameters)
     is_finite = bool(np.isfinite(total_norm))
-    ratio = np.maximum(1, total_norm / max_norm)
+    if max_grad_norm > 0:
+        ratio = np.maximum(1, total_norm / (max_grad_norm / multiplier))
+        scale = 1 / ratio
+    else:
+        scale = multiplier
+        ratio = float('nan')
+
     if check_isfinite and not is_finite:
         warnings.warn(
             UserWarning('nan or inf is detected. Clipping results will be undefined.'
                         ' Thus, skip clipping'),
             stacklevel=2)
         return total_norm, ratio, is_finite
-    scale = 1 / ratio
+
     for p in parameters:
         if p.grad_req != 'null':
             for arr in p.list_grad():