Merge pull request #941 from Warlord-K:master
PiperOrigin-RevId: 479809453
Change-Id: Ib361b75bbf362a24225bd3c65571db1a1f2e76d5
lanctot committed Oct 9, 2022
2 parents 5a5a5b6 + 445cffd commit be1e80e
Showing 1 changed file with 6 additions and 4 deletions.
open_spiel/python/algorithms/policy_gradient.py (6 additions, 4 deletions)
@@ -128,20 +128,21 @@ def __init__(self,
         to (128,), which produces a NN: [INPUT] -> [128] -> ReLU -> [OUTPUT].
       batch_size: int, batch size to use for Q and Pi learning. Defaults to 128.
       critic_learning_rate: float, learning rate used for Critic (Q or V).
-        Defaults to 0.001.
+        Defaults to 0.01.
       pi_learning_rate: float, learning rate used for Pi. Defaults to 0.001.
       entropy_cost: float, entropy cost used to multiply the entropy loss. Can
-        be set to None to skip entropy computation. Defaults to 0.001.
+        be set to None to skip entropy computation. Defaults to 0.01.
       num_critic_before_pi: int, number of Critic (Q or V) updates before each
         Pi update. Defaults to 8 (every 8th critic learning step, Pi also
         learns).
       additional_discount_factor: float, additional discount to compute returns.
         Defaults to 1.0, in which case, no extra discount is applied. Note that
         users must provide *only one of* `loss_str` or `loss_class`.
       max_global_gradient_norm: float or None, maximum global norm of a gradient
-        to which the gradient is shrunk if its value is larger.
+        to which the gradient is shrunk if its value is larger. Defaults to
+        None.
       optimizer_str: String defining which optimizer to use. Supported values
-        are {sgd, adam}
+        are {sgd, adam}. Defaults to sgd.
     """
     assert bool(loss_str) ^ bool(loss_class), "Please provide only one option."
     self._kwargs = locals()
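
For context, a minimal sketch of constructing the agent with the defaults this commit documents. The game name, session handling, and overall call shape below are assumptions inferred from this docstring (the TF1-era constructor also takes a `tf.Session`), not part of the diff itself:

```python
# Hypothetical usage sketch, assuming the TF1-style PolicyGradient constructor.
import tensorflow.compat.v1 as tf

from open_spiel.python import rl_environment
from open_spiel.python.algorithms import policy_gradient

env = rl_environment.Environment("kuhn_poker")  # example game, not from the diff
info_state_size = env.observation_spec()["info_state"][0]
num_actions = env.action_spec()["num_actions"]

with tf.Session() as sess:
  agent = policy_gradient.PolicyGradient(
      sess,
      player_id=0,
      info_state_size=info_state_size,
      num_actions=num_actions,
      loss_str="a2c",                  # provide *only one of* loss_str/loss_class
      critic_learning_rate=0.01,       # default per the corrected docstring
      pi_learning_rate=0.001,          # default per the docstring
      entropy_cost=0.01,               # default per the corrected docstring
      num_critic_before_pi=8,          # Pi learns on every 8th critic step
      additional_discount_factor=1.0,  # no extra discount by default
      max_global_gradient_norm=None,   # no gradient clipping by default
      optimizer_str="sgd")             # {sgd, adam}; sgd is the default
  sess.run(tf.global_variables_initializer())
```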
@@ -299,6 +300,7 @@ def step(self, time_step, is_evaluation=False):
     Args:
       time_step: an instance of rl_environment.TimeStep.
       is_evaluation: bool, whether this is a training or evaluation call.
+        Defaults to False.
 
     Returns:
       A `rl_agent.StepOutput` containing the action probs and chosen action.
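
And a hedged sketch of the `step` call whose `is_evaluation` default this hunk documents; the episode loop follows OpenSpiel's usual `rl_environment` pattern, which is an assumption here rather than something shown in the commit:

```python
# Training episode: is_evaluation defaults to False, so the agent learns.
time_step = env.reset()
while not time_step.last():
  agent_output = agent.step(time_step)
  time_step = env.step([agent_output.action])
agent.step(time_step)  # final call lets the agent observe the terminal reward

# Evaluation episode: is_evaluation=True disables learning updates.
time_step = env.reset()
while not time_step.last():
  agent_output = agent.step(time_step, is_evaluation=True)
  time_step = env.step([agent_output.action])
```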
