constantinpape · constantinpape · Dec 8, 2023 · Dec 8, 2023
diff --git a/torch_em/model/unetr.py b/torch_em/model/unetr.py
@@ -19,6 +19,30 @@
 
 
 class UNETR(nn.Module):
+
+    def _load_encoder_from_checkpoint(self, backbone, encoder, checkpoint):
+
+        if backbone == "sam":
+            # If we have a SAM encoder, then we first try to load the full SAM Model
+            # (using micro_sam) and otherwise fall back on directly loading the encoder state
+            # from the checkpoint
+            try:
+                _, model = get_sam_model(
+                    model_type=encoder,
+                    checkpoint_path=checkpoint,
+                    return_sam=True
+                )
+                encoder_state = model.image_encoder.state_dict()
+            except Exception:
+                # If we have a MAE encoder, then we directly load the encoder state
+                # from the checkpoint.
+                encoder_state = torch.load(checkpoint)
+
+        elif backbone == "mae":
+            encoder_state = torch.load(checkpoint)
+
+        self.encoder.load_state_dict(encoder_state)
+
     def __init__(
         self,
         backbone="sam",
@@ -36,20 +60,9 @@ def __init__(
         self.use_mae_stats = use_mae_stats
 
         print(f"Using {encoder} from {backbone.upper()}")
-
         self.encoder = get_vision_transformer(backbone=backbone, model=encoder)
-
         if encoder_checkpoint_path is not None:
-            if backbone == "sam":
-                _, model = get_sam_model(
-                    model_type=encoder,
-                    checkpoint_path=encoder_checkpoint_path,
-                    return_sam=True
-                )
-                for param1, param2 in zip(model.parameters(), self.encoder.parameters()):
-                    param2.data = param1
-            elif backbone == "mae":
-                raise NotImplementedError
+            self._load_encoder_from_checkpoint(backbone, encoder, encoder_checkpoint_path)
 
         # parameters for the decoder network
         depth = 3

diff --git a/torch_em/model/vit.py b/torch_em/model/vit.py
@@ -39,7 +39,11 @@ def __init__(
                 "and then rerun your code."
             )
 
-        super().__init__(embed_dim=embed_dim, **kwargs)
+        super().__init__(
+            embed_dim=embed_dim,
+            global_attn_indexes=global_attn_indexes,
+            **kwargs,
+        )
         self.chunks_for_projection = global_attn_indexes
         self.in_chans = in_chans
         self.embed_dim = embed_dim