[feat] moe: annotate expert params (#140)
The expert annotation is used by clip_grads and DDP.
msbaines committed Oct 16, 2020
1 parent d99c445 commit ee88bb1
Showing 2 changed files with 13 additions and 0 deletions.
2 changes: 2 additions & 0 deletions fairscale/nn/moe/moelayer.py
@@ -60,6 +60,8 @@ def __init__(self, gate: Module, expert: Module, group: Optional[Any] = None) ->
         self.gate = gate
         self.expert = expert
         self.group = group if group is not None else dist.group.WORLD
+        for p in expert.parameters():
+            p.expert = True  # type: ignore
 
     def all_to_all_dispatch(self, dispatch_mask: Tensor, input: Tensor) -> Tensor:
         dispatched_input = torch.einsum("gsec,gsm->egcm", dispatch_mask.float(), input)
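The commit message says this flag is consumed by clip_grads and DDP. A minimal sketch of one way a gradient-clipping utility could use it, assuming shared parameters are replicated across ranks while expert parameters are rank-local; the helper name clip_grad_norm_by_group is hypothetical, not FairScale's actual clip_grads:

import torch

def clip_grad_norm_by_group(parameters, max_norm: float) -> None:
    # Hypothetical sketch: partition params on the `expert` flag set by MOELayer.
    params = list(parameters)
    shared = [p for p in params if not getattr(p, "expert", False)]
    expert = [p for p in params if getattr(p, "expert", False)]
    # Shared params are replicated, so every rank computes the same norm
    # and plain clipping stays consistent across ranks.
    torch.nn.utils.clip_grad_norm_(shared, max_norm)
    # Expert params exist only on this rank; a real implementation would
    # fold their local norm into a globally reduced total before scaling.
    torch.nn.utils.clip_grad_norm_(expert, max_norm)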
11 changes: 11 additions & 0 deletions tests/nn/moe/test_moelayer.py
@@ -45,6 +45,17 @@ def test_create(device):
     moe = MOELayer(gate, expert).to(device)
 
 
+@pytest.mark.parametrize("device", devices)
+def test_expert_params(device):
+    model_dim = 8
+    num_experts = 4
+    gate = Top2Gate(model_dim, num_experts)
+    expert = torch.nn.Linear(model_dim, model_dim)
+    moe = MOELayer(gate, expert).to(device)
+    for p in expert.parameters():
+        assert p.expert is True
+
+
 @pytest.mark.mpi
 @pytest.mark.parametrize("device", ["cpu"])
 def test_forward(device):
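Similarly, a data-parallel training step can use the flag to keep rank-local expert gradients out of the all-reduce that synchronizes shared gradients. A minimal hand-rolled sketch, not DDP's actual mechanism; allreduce_shared_grads is a hypothetical helper:

import torch.distributed as dist

def allreduce_shared_grads(model, group=dist.group.WORLD):
    # Hypothetical sketch: average only non-expert gradients across ranks.
    # Expert params (flagged by MOELayer) are unique per rank and are skipped.
    world_size = dist.get_world_size(group)
    for p in model.parameters():
        if getattr(p, "expert", False) or p.grad is None:
            continue
        dist.all_reduce(p.grad, op=dist.ReduceOp.SUM, group=group)
        p.grad.div_(world_size)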
