flash_dmattn/integrations/flash_dynamic_mask_attention.py (6 changes: 3 additions & 3 deletions)
@@ -30,10 +30,10 @@ def flash_dynamic_mask_attention_forward(
         query (torch.Tensor): The query tensor of shape (batch_size, num_heads, query_len, head_dim).
         key (torch.Tensor): The key tensor of shape (batch_size, num_kv_heads, key_len, head_dim).
         value (torch.Tensor): The value tensor of shape (batch_size, num_kv_heads, key_len, head_dim).
-        attention_mask (Optional[torch.Tensor]): The attention mask boolean tensor of shape
-            (batch_size, seq_len) or (batch_size, {num_heads|num_kv_heads|1}, {query_len|0}, key_len).
+        attention_mask (Optional[torch.Tensor]): The attention mask boolean tensor of shape
+            (batch_size, seq_len) or ({batch_size|1}, {num_heads|num_kv_heads|1}, {query_len|1}, {key_len|1}).
         attention_bias (Optional[torch.Tensor]): The attention bias float tensor of shape
-            (batch_size, {num_heads|num_kv_heads|1}, {query_len|0}, key_len).
+            ({batch_size|1}, {num_heads|num_kv_heads|1}, {query_len|1}, {key_len|1}).
         scaling (Optional[float]): The scaling factor for the attention scores.
         window_size (Optional[int]): The size of the window to keep.
         softcap (Optional[float]): The softcap value for the attention scores.
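
As a quick illustration of the relaxed shapes in the updated docstring, the following minimal sketch checks that a mask and a bias with singleton dimensions broadcast against each other. The concrete sizes are made up for the example and are not taken from this PR.

```python
import torch

# Illustrative sizes only (not from the PR).
batch, num_kv_heads, query_len, key_len = 2, 4, 8, 128

# Per the updated docstring, either tensor may use singleton dimensions
# in place of batch_size, the head dimension, query_len, or key_len.
attention_mask = torch.rand(batch, 1, 1, key_len) > 0.1            # bool, ({batch|1}, 1, 1, key_len)
attention_bias = torch.randn(1, num_kv_heads, query_len, key_len)  # float, (1, num_kv_heads, query_len, key_len)

# The shapes broadcast to a common 4D shape without any manual expansion.
print(torch.broadcast_shapes(attention_mask.shape, attention_bias.shape))
# torch.Size([2, 4, 8, 128])
```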
@@ -626,11 +626,6 @@ def _flash_dynamic_mask_attention_forward(
     ):
         min_dtype = torch.finfo(query_states.dtype).min
         if attention_mask is not None:

Copilot AI commented on Oct 13, 2025:

After removing the dimension compatibility checks, this code will fail if attention_mask and attention_bias have incompatible shapes for broadcasting in the masked_fill operation. The removed checks were handling dimension mismatches between 3D and 4D tensors.

Suggested change
-        if attention_mask is not None:
+        if attention_mask is not None:
+            # Ensure attention_mask is broadcastable to attention_bias
+            if attention_mask.shape != attention_bias.shape:
+                # Try to unsqueeze or expand attention_mask to match attention_bias
+                # Common case: attention_mask is (batch, seq_len) and attention_bias is (batch, num_heads, seq_len, seq_len)
+                # or attention_mask is (batch, 1, seq_len, seq_len)
+                # We'll try to expand attention_mask to match attention_bias
+                try:
+                    attention_mask = attention_mask.expand_as(attention_bias)
+                except RuntimeError:
+                    # Try unsqueeze and expand for common 3D->4D case
+                    while attention_mask.dim() < attention_bias.dim():
+                        attention_mask = attention_mask.unsqueeze(1)
+                    attention_mask = attention_mask.expand_as(attention_bias)

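To make the concern above concrete, here is a small standalone sketch (hypothetical shapes, not code from this PR) of why masked_fill raises when the mask is not broadcastable against the bias, and how inserting singleton dimensions, as the suggestion's 3D-to-4D loop does, resolves it.

```python
import torch

# Hypothetical shapes chosen so that a raw 2D mask does not broadcast.
batch, num_heads, query_len, key_len = 2, 4, 8, 8
attention_bias = torch.randn(batch, num_heads, query_len, key_len)
attention_mask = torch.ones(batch, key_len, dtype=torch.bool)  # (batch, seq_len)
min_dtype = torch.finfo(attention_bias.dtype).min

try:
    # Broadcasting aligns trailing dimensions, so batch lines up with
    # query_len here and masked_fill raises a RuntimeError.
    attention_bias.masked_fill(~attention_mask, min_dtype)
except RuntimeError as err:
    print("masked_fill failed:", err)

# Adding singleton head and query dimensions makes the mask broadcastable.
expanded_mask = attention_mask[:, None, None, :]  # (batch, 1, 1, key_len)
out = attention_bias.masked_fill(~expanded_mask, min_dtype)
print(out.shape)  # torch.Size([2, 4, 8, 8])
```
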
-            if attention_mask.dim() == 4 and attention_bias.dim() == 3:
-                attention_bias = attention_bias.unsqueeze(-2).expand(-1, -1, query_length, -1)
-            if attention_mask.dim() == 3 and attention_bias.dim() == 4:
-                attention_mask = attention_mask.unsqueeze(-2).expand(-1, -1, query_length, -1)
-
             topk_values, topk_indices = torch.topk(
                 attention_bias.masked_fill(~attention_mask, min_dtype).detach(),
                 window_size, dim=-1, largest=True, sorted=False
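
For reference, the window selection in the remaining context lines of this hunk can be reproduced in isolation. This is a minimal sketch with made-up shapes and window_size, not the library's implementation.

```python
import torch

# Made-up shapes for illustration; the real function derives these from its inputs.
batch, num_heads, query_len, key_len, window_size = 1, 2, 4, 16, 8

attention_bias = torch.randn(batch, num_heads, query_len, key_len)
attention_mask = torch.rand(batch, num_heads, query_len, key_len) > 0.2  # True = keep
min_dtype = torch.finfo(attention_bias.dtype).min

# Masked-out positions are pushed to the dtype minimum so top-k never selects them,
# then the window_size largest bias values per query are kept.
topk_values, topk_indices = torch.topk(
    attention_bias.masked_fill(~attention_mask, min_dtype).detach(),
    window_size, dim=-1, largest=True, sorted=False
)
print(topk_indices.shape)  # torch.Size([1, 2, 4, 8])
```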