facebookresearch · Aladoro · Sep 7, 2021 · Sep 7, 2021 · Jan 15, 2022
diff --git a/drqv2.py b/drqv2.py
@@ -100,25 +100,21 @@ def __init__(self, repr_dim, action_shape, feature_dim, hidden_dim):
         self.trunk = nn.Sequential(nn.Linear(repr_dim, feature_dim),
                                    nn.LayerNorm(feature_dim), nn.Tanh())
 
-        self.Q1 = nn.Sequential(
-            nn.Linear(feature_dim + action_shape[0], hidden_dim),
-            nn.ReLU(inplace=True), nn.Linear(hidden_dim, hidden_dim),
-            nn.ReLU(inplace=True), nn.Linear(hidden_dim, 1))
-
-        self.Q2 = nn.Sequential(
-            nn.Linear(feature_dim + action_shape[0], hidden_dim),
-            nn.ReLU(inplace=True), nn.Linear(hidden_dim, hidden_dim),
-            nn.ReLU(inplace=True), nn.Linear(hidden_dim, 1))
+        self.QS = nn.Sequential(
+            utils.DenseParallel(feature_dim + action_shape[0], hidden_dim, 2),
+            nn.ReLU(inplace=True),
+            utils.DenseParallel(hidden_dim, hidden_dim, 2),
+            nn.ReLU(inplace=True),
+            utils.DenseParallel(hidden_dim, 1, 2))
 
         self.apply(utils.weight_init)
 
     def forward(self, obs, action):
         h = self.trunk(obs)
         h_action = torch.cat([h, action], dim=-1)
-        q1 = self.Q1(h_action)
-        q2 = self.Q2(h_action)
+        qs = self.QS(h_action)  # presumably (2, batch, 1) when h_action is (batch, dim) - TODO confirm
 
-        return q1, q2
+        return torch.squeeze(torch.transpose(qs, 0, 1), dim=-1)  # -> (batch, 2) under that assumption
 
 
 class DrQV2Agent:
@@ -181,17 +177,17 @@ def update_critic(self, obs, action, reward, discount, next_obs, step):
             stddev = utils.schedule(self.stddev_schedule, step)
             dist = self.actor(next_obs, stddev)
             next_action = dist.sample(clip=self.stddev_clip)
-            target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
-            target_V = torch.min(target_Q1, target_Q2)
+            target_QS = self.critic_target(next_obs, next_action)
+            target_V = target_QS.amin(dim=1, keepdim=True)
             target_Q = reward + (discount * target_V)
 
-        Q1, Q2 = self.critic(obs, action)
-        critic_loss = F.mse_loss(Q1, target_Q) + F.mse_loss(Q2, target_Q)
+        QS = self.critic(obs, action)
+        critic_loss = (QS - target_Q).square().sum(1).mean()
 
         if self.use_tb:
             metrics['critic_target_q'] = target_Q.mean().item()
-            metrics['critic_q1'] = Q1.mean().item()
-            metrics['critic_q2'] = Q2.mean().item()
+            metrics['critic_q1'] = QS[..., 0].mean().item()
+            metrics['critic_q2'] = QS[..., 1].mean().item()
             metrics['critic_loss'] = critic_loss.item()
 
         # optimize encoder and critic
@@ -210,8 +206,8 @@ def update_actor(self, obs, step):
         dist = self.actor(obs, stddev)
         action = dist.sample(clip=self.stddev_clip)
         log_prob = dist.log_prob(action).sum(-1, keepdim=True)
-        Q1, Q2 = self.critic(obs, action)
-        Q = torch.min(Q1, Q2)
+        QS = self.critic(obs, action)
+        Q = QS.amin(dim=1)
 
         actor_loss = -Q.mean()
 

diff --git a/utils.py b/utils.py
@@ -49,6 +49,70 @@ def to_torch(xs, device):
     return tuple(torch.as_tensor(x, device=device) for x in xs)
 
 
+class DenseParallel(nn.Module):
+    """Bank of `n_parallel` independent linear layers applied with one broadcast `torch.matmul`."""
+    def __init__(self, in_features: int, out_features: int, n_parallel: int,
+                 bias: bool = True, device=None, dtype=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super(DenseParallel, self).__init__()
+        self.in_features, self.out_features, self.n_parallel = in_features, out_features, n_parallel
+        self.weight = nn.Parameter(torch.empty((n_parallel, in_features, out_features), **factory_kwargs))
+        if bias:
+            self.bias = nn.Parameter(torch.empty((n_parallel, 1, out_features), **factory_kwargs))
+        else:
+            self.register_parameter('bias', None)  # keeps `self.bias` defined (as None)
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        """Draw weight and bias from U(-1/sqrt(in_features), 1/sqrt(in_features)), like `nn.Linear`."""
+        bound = 1 / np.sqrt(self.in_features) if self.in_features > 0 else 0  # fan_in of one member
+        nn.init.uniform_(self.weight, -bound, bound)
+        if self.bias is not None:
+            nn.init.uniform_(self.bias, -bound, bound)
+
+    def forward(self, input):
+        """Return `input @ weight (+ bias)`: (batch, in) or (n_parallel, batch, in) -> (n_parallel, batch, out)."""
+        out = torch.matmul(input, self.weight)
+        return out if self.bias is None else out + self.bias  # bias is None when built with bias=False
+
+    def extra_repr(self) -> str:
+        return 'in_features={}, out_features={}, n_parallel={}, bias={}'.format(
+            self.in_features, self.out_features, self.n_parallel, self.bias is not None)
+
+
+def parallel_orthogonal_(tensor, gain=1):
+    """Fill every `tensor[i]` in place with its own random (semi-)orthogonal matrix times `gain`.
+
+    Dim 0 indexes the parallel members and dim 1 is the row count; any
+    trailing dims are flattened into columns. Returns `tensor` itself.
+    Raises ValueError when `tensor` has fewer than 3 dimensions.
+    """
+    if tensor.ndimension() < 3:
+        raise ValueError("Only tensors with 3 or more dimensions are supported")
+    if tensor.numel() == 0:
+        # Nothing to fill; the size arithmetic below could divide by zero.
+        return tensor
+
+    n_parallel = tensor.size(0)
+    rows = tensor.size(1)
+    cols = tensor.numel() // n_parallel // rows
+    flattened = tensor.new(n_parallel, rows, cols).normal_(0, 1)
+    if rows < cols:
+        # Factor the transpose so the reduced Q keeps `rows` orthonormal vectors.
+        flattened = flattened.transpose(1, 2)
+
+    # Batched QR factorization: one call covers every parallel member.
+    q, r = torch.linalg.qr(flattened)
+    # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf
+    q = q * torch.diagonal(r, dim1=-2, dim2=-1).sign().unsqueeze(-2)
+    if rows < cols:
+        q = q.transpose(1, 2)
+    with torch.no_grad():
+        tensor.view_as(q).copy_(q)
+        tensor.mul_(gain)
+    return tensor
+
+
 def weight_init(m):
     if isinstance(m, nn.Linear):
         nn.init.orthogonal_(m.weight.data)
@@ -59,6 +123,11 @@ def weight_init(m):
         nn.init.orthogonal_(m.weight.data, gain)
         if hasattr(m.bias, 'data'):
             m.bias.data.fill_(0.0)
+    elif isinstance(m, DenseParallel):
+        gain = nn.init.calculate_gain('relu')
+        parallel_orthogonal_(m.weight.data, gain)
+        if hasattr(m.bias, 'data'):
+            m.bias.data.fill_(0.0)
 
 
 class Until: