From c7a22e1b4ef3cfa4d2a28acf95323bac0243d99d Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 3 Jun 2026 18:13:20 +0300 Subject: [PATCH 1/5] [Partner Nodes] feat: add Ideogram V4 node (#14261) Signed-off-by: bigcat88 --- comfy_api_nodes/apis/ideogram.py | 16 +++++ comfy_api_nodes/nodes_ideogram.py | 116 ++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+) diff --git a/comfy_api_nodes/apis/ideogram.py b/comfy_api_nodes/apis/ideogram.py index 737e18e3b0a6..c5ad9559f52c 100644 --- a/comfy_api_nodes/apis/ideogram.py +++ b/comfy_api_nodes/apis/ideogram.py @@ -290,3 +290,19 @@ class IdeogramV3Request(BaseModel): None, description='Optional masks for character reference images. When provided, must match the number of character_reference_images. Each mask should be a grayscale image of the same dimensions as the corresponding character reference image. The images should be in JPEG, PNG or WebP format.' ) + + +class IdeogramV4Request(BaseModel): + text_prompt: str | None = Field( + None, + description="Natural-language prompt; Magic Prompt is applied automatically. " + "Supply exactly one of text_prompt or json_prompt.", + ) + json_prompt: dict[str, Any] | None = Field( + None, + description="Structured V4 prompt object consumed directly (disables Magic Prompt). " + "Supply exactly one of text_prompt or json_prompt.", + ) + resolution: str | None = Field(None, description="Output resolution in WIDTHxHEIGHT (e.g. '2048x2048').") + rendering_speed: str | None = Field(None, description="Rendering speed: 'TURBO', 'DEFAULT', or 'QUALITY'.") + enable_copyright_detection: bool | None = Field(None, description="Opt into post-generation copyright detection.") diff --git a/comfy_api_nodes/nodes_ideogram.py b/comfy_api_nodes/nodes_ideogram.py index 8018c3902b6c..3b914a850cd5 100644 --- a/comfy_api_nodes/nodes_ideogram.py +++ b/comfy_api_nodes/nodes_ideogram.py @@ -10,6 +10,7 @@ ImageRequest, IdeogramV3Request, IdeogramV3EditRequest, + IdeogramV4Request, ) from comfy_api_nodes.util import ( ApiEndpoint, @@ -17,6 +18,7 @@ download_url_as_bytesio, resize_mask_to_image, sync_op, + validate_string, ) V1_V1_RES_MAP = { @@ -798,6 +800,119 @@ async def execute( return IO.NodeOutput(await download_and_process_images(image_urls)) +class IdeogramV4(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="IdeogramV4", + display_name="Ideogram V4", + category="partner/image/Ideogram", + description="Generates images using the Ideogram 4.0 model from a text prompt.", + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text prompt for the image generation.", + ), + IO.Combo.Input( + "resolution", + options=[ + "Auto", + "2048x2048 (1:1)", + "1440x2880 (1:2)", + "2880x1440 (2:1)", + "1664x2496 (2:3)", + "2496x1664 (3:2)", + "1792x2240 (4:5)", + "2240x1792 (5:4)", + "1440x2560 (9:16)", + "2560x1440 (16:9)", + "1600x2560 (5:8)", + "2560x1600 (8:5)", + "1728x2304 (3:4)", + "2304x1728 (4:3)", + "1296x3168 (9:22)", + "3168x1296 (22:9)", + "1152x2944 (9:23)", + "2944x1152 (23:9)", + "1248x3328 (3:8)", + "3328x1248 (8:3)", + "1280x3072 (5:12)", + "3072x1280 (12:5)", + ], + default="Auto", + ), + IO.Combo.Input( + "rendering_speed", + options=["DEFAULT", "TURBO", "QUALITY"], + default="DEFAULT", + tooltip="Controls the trade-off between generation speed and quality.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + control_after_generate=True, + display_mode=IO.NumberDisplay.number, + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["rendering_speed"]), + expr=""" + ( + $speed := widgets.rendering_speed; + $price := + $contains($speed,"turbo") ? 0.0429 : + $contains($speed,"quality") ? 0.143 : + 0.0858; + {"type":"usd","usd": $price} + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + prompt: str, + resolution: str, + rendering_speed: str, + seed: int, + ): + validate_string(prompt, strip_whitespace=True, min_length=1) + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/ideogram/ideogram-v4/generate", method="POST"), + response_model=IdeogramGenerateResponse, + data=IdeogramV4Request( + text_prompt=prompt, + resolution=resolution.split(" ")[0] if resolution != "Auto" else None, + rendering_speed=rendering_speed, + ), + max_retries=1, + ) + + if not response.data or len(response.data) == 0: + raise Exception("No images were generated in the response") + image_urls = [image_data.url for image_data in response.data if image_data.url] + if not image_urls: + raise Exception("No image URLs were generated in the response") + return IO.NodeOutput(await download_and_process_images(image_urls)) + + class IdeogramExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[IO.ComfyNode]]: @@ -805,6 +920,7 @@ async def get_node_list(self) -> list[type[IO.ComfyNode]]: IdeogramV1, IdeogramV2, IdeogramV3, + IdeogramV4, ] From 24f9a020ce0c0f6966fcb79e5580afbee5706904 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Wed, 3 Jun 2026 18:41:44 +0300 Subject: [PATCH 2/5] Support Ideogram4 (#14259) --- comfy/ldm/ideogram4/model.py | 297 +++++++++++++++++++++++++++ comfy/model_base.py | 16 ++ comfy/model_detection.py | 7 + comfy/sd.py | 10 +- comfy/supported_models.py | 40 ++++ comfy/text_encoders/ideogram4.py | 77 +++++++ comfy_extras/nodes_custom_sampler.py | 122 +++++++++++ comfy_extras/nodes_ideogram4.py | 64 ++++++ nodes.py | 3 +- 9 files changed, 633 insertions(+), 3 deletions(-) create mode 100644 comfy/ldm/ideogram4/model.py create mode 100644 comfy/text_encoders/ideogram4.py create mode 100644 comfy_extras/nodes_ideogram4.py diff --git a/comfy/ldm/ideogram4/model.py b/comfy/ldm/ideogram4/model.py new file mode 100644 index 000000000000..3b02a243a966 --- /dev/null +++ b/comfy/ldm/ideogram4/model.py @@ -0,0 +1,297 @@ +""" +The Ideogram 4 transformer is a NextDiT/Lumina2-family single-stream model +consumes Qwen3-VL hidden-state features (concatenated from 13 layers -> 53248 dims) +packs ``[text tokens, image tokens]`` into one sequence with block-diagonal segment attention and 3D interleaved MRoPE. +""" + +from __future__ import annotations + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import comfy.patcher_extension +from comfy.ldm.lumina.model import FeedForward +from comfy.ldm.modules.attention import optimized_attention_masked +from comfy.text_encoders.llama import apply_rope, precompute_freqs_cis + +# Per-token role indicators +SEQUENCE_PADDING_INDICATOR = -1 +OUTPUT_IMAGE_INDICATOR = 2 +LLM_TOKEN_INDICATOR = 3 +# Image grid coordinates are offset so they never collide with text positions +IMAGE_POSITION_OFFSET = 65536 + + +class Ideogram4Attention(nn.Module): + def __init__(self, hidden_size, num_heads, eps=1e-5, dtype=None, device=None, operations=None): + super().__init__() + self.num_heads = num_heads + self.head_dim = hidden_size // num_heads + self.hidden_size = hidden_size + + self.qkv = operations.Linear(hidden_size, hidden_size * 3, bias=False, dtype=dtype, device=device) + self.norm_q = operations.RMSNorm(self.head_dim, eps=eps, elementwise_affine=True, dtype=dtype, device=device) + self.norm_k = operations.RMSNorm(self.head_dim, eps=eps, elementwise_affine=True, dtype=dtype, device=device) + self.o = operations.Linear(hidden_size, hidden_size, bias=False, dtype=dtype, device=device) + + def forward(self, x, attn_mask, freqs_cis, transformer_options={}): + batch_size, seq_len, _ = x.shape + qkv = self.qkv(x).view(batch_size, seq_len, 3, self.num_heads, self.head_dim) + q, k, v = qkv.unbind(dim=2) + + q = self.norm_q(q) + k = self.norm_k(k) + + # (B, heads, L, head_dim) + q = q.transpose(1, 2) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + q, k = apply_rope(q, k, freqs_cis) + + out = optimized_attention_masked(q, k, v, self.num_heads, attn_mask, skip_reshape=True, transformer_options=transformer_options) + return self.o(out) + + +class Ideogram4TransformerBlock(nn.Module): + def __init__(self, hidden_size, intermediate_size, num_heads, norm_eps, adaln_dim, dtype=None, device=None, operations=None): + super().__init__() + self.attention = Ideogram4Attention(hidden_size, num_heads, eps=1e-5, dtype=dtype, device=device, operations=operations) + self.feed_forward = FeedForward( + dim=hidden_size, hidden_dim=intermediate_size, multiple_of=1, ffn_dim_multiplier=None, + operation_settings={"operations": operations, "dtype": dtype, "device": device}, + ) + + self.attention_norm1 = operations.RMSNorm(hidden_size, eps=norm_eps, elementwise_affine=True, dtype=dtype, device=device) + self.ffn_norm1 = operations.RMSNorm(hidden_size, eps=norm_eps, elementwise_affine=True, dtype=dtype, device=device) + self.attention_norm2 = operations.RMSNorm(hidden_size, eps=norm_eps, elementwise_affine=True, dtype=dtype, device=device) + self.ffn_norm2 = operations.RMSNorm(hidden_size, eps=norm_eps, elementwise_affine=True, dtype=dtype, device=device) + + self.adaln_modulation = operations.Linear(adaln_dim, 4 * hidden_size, bias=True, dtype=dtype, device=device) + + def forward(self, x, attn_mask, freqs_cis, adaln_input, transformer_options={}): + mod = self.adaln_modulation(adaln_input) + scale_msa, gate_msa, scale_mlp, gate_mlp = mod.chunk(4, dim=-1) + gate_msa = torch.tanh(gate_msa) + gate_mlp = torch.tanh(gate_mlp) + scale_msa = 1.0 + scale_msa + scale_mlp = 1.0 + scale_mlp + + attn_out = self.attention(self.attention_norm1(x) * scale_msa, attn_mask, freqs_cis, transformer_options=transformer_options) + x = x + gate_msa * self.attention_norm2(attn_out) + x = x + gate_mlp * self.ffn_norm2(self.feed_forward(self.ffn_norm1(x) * scale_mlp)) + return x + + +def _sinusoidal_embedding(t, dim, scale=1e4): + t = t.to(torch.float32) + half = dim // 2 + freq = math.log(scale) / (half - 1) + freq = torch.exp(torch.arange(half, dtype=torch.float32, device=t.device) * -freq) + emb = t.unsqueeze(-1) * freq + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1) + if dim % 2 == 1: + emb = F.pad(emb, (0, 1)) + return emb + + +class Ideogram4EmbedScalar(nn.Module): + def __init__(self, dim, input_range=(0.0, 1.0), dtype=None, device=None, operations=None): + super().__init__() + self.dim = dim + self.range_min, self.range_max = input_range + self.mlp_in = operations.Linear(dim, dim, bias=True, dtype=dtype, device=device) + self.mlp_out = operations.Linear(dim, dim, bias=True, dtype=dtype, device=device) + + def forward(self, x): + x = x.to(torch.float32) + scaled = 1e4 * (x - self.range_min) / (self.range_max - self.range_min) + emb = _sinusoidal_embedding(scaled, self.dim) + emb = emb.to(self.mlp_in.weight.dtype) + emb = F.silu(self.mlp_in(emb)) + return self.mlp_out(emb) + + +class Ideogram4FinalLayer(nn.Module): + def __init__(self, hidden_size, out_channels, adaln_dim, dtype=None, device=None, operations=None): + super().__init__() + self.norm_final = operations.LayerNorm(hidden_size, eps=1e-6, elementwise_affine=False, dtype=dtype, device=device) + self.linear = operations.Linear(hidden_size, out_channels, bias=True, dtype=dtype, device=device) + self.adaln_modulation = operations.Linear(adaln_dim, hidden_size, bias=True, dtype=dtype, device=device) + + def forward(self, x, c): + scale = 1.0 + self.adaln_modulation(F.silu(c)) + return self.linear(self.norm_final(x) * scale) + + +class Ideogram4Transformer(nn.Module): + """A single Ideogram 4 backbone operating on a packed token sequence.""" + + def __init__(self, emb_dim, num_layers, num_heads, intermediate_size, adaln_dim, + in_channels, llm_features_dim, rope_theta, mrope_section, norm_eps, + dtype=None, device=None, operations=None): + super().__init__() + self.head_dim = emb_dim // num_heads + self.rope_theta = rope_theta + self.mrope_section = tuple(mrope_section) + + self.input_proj = operations.Linear(in_channels, emb_dim, bias=True, dtype=dtype, device=device) + self.llm_cond_norm = operations.RMSNorm(llm_features_dim, eps=1e-6, elementwise_affine=True, dtype=dtype, device=device) + self.llm_cond_proj = operations.Linear(llm_features_dim, emb_dim, bias=True, dtype=dtype, device=device) + self.t_embedding = Ideogram4EmbedScalar(emb_dim, input_range=(0.0, 1.0), dtype=dtype, device=device, operations=operations) + self.adaln_proj = operations.Linear(emb_dim, adaln_dim, bias=True, dtype=dtype, device=device) + + self.embed_image_indicator = operations.Embedding(2, emb_dim, dtype=dtype, device=device) + + self.layers = nn.ModuleList([ + Ideogram4TransformerBlock(emb_dim, intermediate_size, num_heads, norm_eps, adaln_dim, + dtype=dtype, device=device, operations=operations) + for _ in range(num_layers) + ]) + + self.final_layer = Ideogram4FinalLayer(emb_dim, in_channels, adaln_dim, dtype=dtype, device=device, operations=operations) + + def _backbone(self, llm_features, x, t, position_ids, attn_mask, indicator, transformer_options={}): + indicator = indicator.to(torch.long) + output_image_mask = (indicator == OUTPUT_IMAGE_INDICATOR).to(x.dtype).unsqueeze(-1) + + x = x * output_image_mask + h = self.input_proj(x) * output_image_mask + + t_cond = self.t_embedding(t) + if t.dim() == 1: + t_cond = t_cond.unsqueeze(1) + adaln_input = F.silu(self.adaln_proj(t_cond)) + + # h is zero on the text rows (content lives only on image rows), add writes the text features in place + if llm_features is not None: + L_text = llm_features.shape[1] + text_mask = (indicator[:, :L_text] == LLM_TOKEN_INDICATOR).to(x.dtype).unsqueeze(-1) + llm = self.llm_cond_norm(llm_features * text_mask) + llm = self.llm_cond_proj(llm) * text_mask + h[:, :L_text] = h[:, :L_text] + llm + + h = h + self.embed_image_indicator((indicator == OUTPUT_IMAGE_INDICATOR).to(torch.long)) + + # Qwen3-VL interleaved MRoPE; position_ids (B, L, 3) -> (3, L) (same across batch). + freqs_cis = precompute_freqs_cis( + self.head_dim, position_ids[0].transpose(0, 1), self.rope_theta, + rope_dims=self.mrope_section, interleaved_mrope=True, device=position_ids.device, + ) + + if attn_mask is not None and attn_mask.dtype == torch.bool: + attn_mask = torch.zeros_like(attn_mask, dtype=h.dtype).masked_fill_(~attn_mask, -torch.finfo(h.dtype).max) + + for layer in self.layers: + h = layer(h, attn_mask, freqs_cis, adaln_input, transformer_options=transformer_options) + + return self.final_layer(h, adaln_input) + + +class Ideogram4Transformer2DModel(Ideogram4Transformer): + """Ideogram 4 single-stream DiT. + + Runs a packed ``[text, image]`` sequence when text context is supplied, or an image-only sequence when ``context is None``. + """ + + def __init__(self, image_model=None, in_channels=128, num_layers=34, num_attention_heads=18, attention_head_dim=256, intermediate_size=12288, + adaln_dim=512, llm_features_dim=53248, rope_theta=5000000, mrope_section=(24, 20, 20), norm_eps=1e-5, + dtype=None, device=None, operations=None, **kwargs): + emb_dim = num_attention_heads * attention_head_dim + super().__init__( + emb_dim=emb_dim, num_layers=num_layers, num_heads=num_attention_heads, + intermediate_size=intermediate_size, adaln_dim=adaln_dim, in_channels=in_channels, + llm_features_dim=llm_features_dim, rope_theta=rope_theta, mrope_section=mrope_section, + norm_eps=norm_eps, dtype=dtype, device=device, operations=operations) + self.dtype = dtype + self.in_channels = in_channels + self.out_channels = in_channels + # 128-dim token = patch (2x2) * ae_channels (32). + self.patch_size = 2 + self.ae_channels = in_channels // (self.patch_size * self.patch_size) + + def _img_to_tokens(self, x): + B, C, gh, gw = x.shape + x = x.view(B, self.ae_channels, self.patch_size, self.patch_size, gh, gw) + x = x.permute(0, 4, 5, 2, 3, 1) # (B, gh, gw, pi, pj, c) + return x.reshape(B, gh * gw, C) + + def _tokens_to_img(self, tokens, gh, gw): + B = tokens.shape[0] + C = tokens.shape[-1] + x = tokens.reshape(B, gh, gw, self.patch_size, self.patch_size, self.ae_channels) + x = x.permute(0, 5, 3, 4, 1, 2) # (B, c, pi, pj, gh, gw) + return x.reshape(B, C, gh, gw) + + def _image_position_ids(self, gh, gw, device): + h_idx = torch.arange(gh, device=device).view(-1, 1).expand(gh, gw).reshape(-1) + w_idx = torch.arange(gw, device=device).view(1, -1).expand(gh, gw).reshape(-1) + t_idx = torch.zeros_like(h_idx) + return torch.stack([t_idx, h_idx, w_idx], dim=1) + IMAGE_POSITION_OFFSET # (L_img, 3) + + def _run_conditional(self, x_chunk, context_chunk, attn_mask_chunk, t_chunk, gh, gw, transformer_options): + B = x_chunk.shape[0] + device = x_chunk.device + img_tokens = self._img_to_tokens(x_chunk).to(self.dtype) + L_img = img_tokens.shape[1] + L_text = context_chunk.shape[1] + L = L_text + L_img + latent_dim = img_tokens.shape[-1] + + x_full = torch.zeros(B, L, latent_dim, dtype=img_tokens.dtype, device=device) + x_full[:, L_text:] = img_tokens + + text_pos = torch.arange(L_text, device=device).view(-1, 1).expand(L_text, 3) + img_pos = self._image_position_ids(gh, gw, device) + position_ids = torch.cat([text_pos, img_pos], dim=0).unsqueeze(0).expand(B, L, 3) + + indicator = torch.empty(B, L, dtype=torch.long, device=device) + indicator[:, :L_text] = LLM_TOKEN_INDICATOR + indicator[:, L_text:] = OUTPUT_IMAGE_INDICATOR + + attn_mask = None + if attn_mask_chunk is not None: + segment_ids = torch.ones(B, L, dtype=torch.long, device=device) + pad = (attn_mask_chunk == 0) + segment_ids[:, :L_text][pad] = SEQUENCE_PADDING_INDICATOR + indicator[:, :L_text][pad] = 0 + # Block-diagonal mask from segment ids: (B, 1, L, L), True = attend. + attn_mask = (segment_ids.unsqueeze(2) == segment_ids.unsqueeze(1)).unsqueeze(1) + + out = self._backbone(context_chunk, x_full, t_chunk, position_ids, attn_mask, indicator, + transformer_options=transformer_options) + return self._tokens_to_img(out[:, L_text:], gh, gw) + + def _run_image_only(self, x_chunk, t_chunk, gh, gw, transformer_options): + B = x_chunk.shape[0] + device = x_chunk.device + img_tokens = self._img_to_tokens(x_chunk).to(self.dtype) + L_img = img_tokens.shape[1] + + position_ids = self._image_position_ids(gh, gw, device).unsqueeze(0).expand(B, L_img, 3) + indicator = torch.full((B, L_img), OUTPUT_IMAGE_INDICATOR, dtype=torch.long, device=device) + + # Image-only sequence is a single segment -> no mask, full attention, no LLM context. + out = self._backbone(None, img_tokens, t_chunk, position_ids, None, indicator, transformer_options=transformer_options) + return self._tokens_to_img(out, gh, gw) + + def forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options), + ).execute(x, timesteps, context, attention_mask, transformer_options, **kwargs) + + def _forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs): + bs, c, gh, gw = x.shape + + timesteps = 1.0 - timesteps + + # unconditional pass + if context is None: + return -self._run_image_only(x, timesteps, gh, gw, transformer_options) + + return -self._run_conditional(x, context, attention_mask, timesteps, gh, gw, transformer_options) diff --git a/comfy/model_base.py b/comfy/model_base.py index 3e2d4e930130..042804771890 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -55,6 +55,7 @@ import comfy.ldm.ace.model import comfy.ldm.omnigen.omnigen2 import comfy.ldm.qwen_image.model +import comfy.ldm.ideogram4.model import comfy.ldm.kandinsky5.model import comfy.ldm.anima.model import comfy.ldm.ace.ace_step15 @@ -2018,6 +2019,21 @@ def extra_conds_shapes(self, **kwargs): out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16]) return out +class Ideogram4(BaseModel): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.ideogram4.model.Ideogram4Transformer2DModel) + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + attention_mask = kwargs.get("attention_mask", None) + if attention_mask is not None: + if torch.numel(attention_mask) != attention_mask.sum(): + out['attention_mask'] = comfy.conds.CONDRegular(attention_mask) + cross_attn = kwargs.get("cross_attn", None) + if cross_attn is not None: + out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + return out + class HunyuanImage21(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan_video.model.HunyuanVideo) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 24e742a7f5c3..74c838d13338 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -815,6 +815,13 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["default_ref_method"] = "negative_index" return dit_config + if '{}embed_image_indicator.weight'.format(key_prefix) in state_dict_keys: # Ideogram 4 + dit_config = {} + dit_config["image_model"] = "ideogram4" + dit_config["in_channels"] = state_dict['{}input_proj.weight'.format(key_prefix)].shape[1] + dit_config["num_layers"] = count_blocks(state_dict_keys, '{}layers.'.format(key_prefix) + '{}.') + return dit_config + if '{}visual_transformer_blocks.0.cross_attention.key_norm.weight'.format(key_prefix) in state_dict_keys: # Kandinsky 5 dit_config = {} model_dim = state_dict['{}visual_embeddings.in_layer.bias'.format(key_prefix)].shape[0] diff --git a/comfy/sd.py b/comfy/sd.py index 9a2d31930f0b..a66ba1bfb76e 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -58,6 +58,7 @@ import comfy.text_encoders.qwen_image import comfy.text_encoders.hunyuan_image import comfy.text_encoders.z_image +import comfy.text_encoders.ideogram4 import comfy.text_encoders.ovis import comfy.text_encoders.kandinsky5 import comfy.text_encoders.jina_clip_2 @@ -1298,6 +1299,7 @@ class CLIPType(Enum): COGVIDEOX = 27 LENS = 28 PIXELDIT = 29 + IDEOGRAM4 = 30 @@ -1596,8 +1598,12 @@ class EmptyClass: clip_target.clip = comfy.text_encoders.ovis.te(**llama_detect(clip_data)) clip_target.tokenizer = comfy.text_encoders.ovis.OvisTokenizer elif te_model == TEModel.QWEN3_8B: - clip_target.clip = comfy.text_encoders.flux.klein_te(**llama_detect(clip_data), model_type="qwen3_8b") - clip_target.tokenizer = comfy.text_encoders.flux.KleinTokenizer8B + if clip_type == CLIPType.IDEOGRAM4: + clip_target.clip = comfy.text_encoders.ideogram4.te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.ideogram4.Ideogram4Tokenizer + else: + clip_target.clip = comfy.text_encoders.flux.klein_te(**llama_detect(clip_data), model_type="qwen3_8b") + clip_target.tokenizer = comfy.text_encoders.flux.KleinTokenizer8B elif te_model == TEModel.JINA_CLIP_2: clip_target.clip = comfy.text_encoders.jina_clip_2.JinaClip2TextModelWrapper clip_target.tokenizer = comfy.text_encoders.jina_clip_2.JinaClip2TokenizerWrapper diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 0872b0e276e2..478489ed891b 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -24,6 +24,7 @@ import comfy.text_encoders.hunyuan_image import comfy.text_encoders.kandinsky5 import comfy.text_encoders.z_image +import comfy.text_encoders.ideogram4 import comfy.text_encoders.anima import comfy.text_encoders.ace15 import comfy.text_encoders.longcat_image @@ -1746,6 +1747,44 @@ def clip_target(self, state_dict={}): hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_3b.transformer.".format(pref)) return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect)) +class Ideogram4(supported_models_base.BASE): + unet_config = { + "image_model": "ideogram4", + } + + sampling_settings = { + "multiplier": 1.0, + "shift": 1.0, + } + + memory_usage_factor = 1.8 # TODO + + unet_extra_config = { + "num_attention_heads": 18, + "attention_head_dim": 256, + "intermediate_size": 12288, + "adaln_dim": 512, + "llm_features_dim": 53248, + "rope_theta": 5000000, + "mrope_section": [24, 20, 20], + "norm_eps": 1e-5, + } + latent_format = latent_formats.Flux2 + + supported_inference_dtypes = [torch.bfloat16, torch.float32] + + vae_key_prefix = ["vae."] + text_encoder_key_prefix = ["text_encoders."] + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.Ideogram4(self, device=device) + return out + + def clip_target(self, state_dict={}): + pref = self.text_encoder_key_prefix[0] + hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3vl_8b.transformer.".format(pref)) + return supported_models_base.ClipTarget(comfy.text_encoders.ideogram4.Ideogram4Tokenizer, comfy.text_encoders.ideogram4.te(**hunyuan_detect)) + class QwenImage(supported_models_base.BASE): unet_config = { "image_model": "qwen_image", @@ -2233,6 +2272,7 @@ def get_model(self, state_dict, prefix="", device=None): ACEStep15, Omnigen2, QwenImage, + Ideogram4, Flux2, Lens, Kandinsky5Image, diff --git a/comfy/text_encoders/ideogram4.py b/comfy/text_encoders/ideogram4.py new file mode 100644 index 000000000000..55e655d67a2f --- /dev/null +++ b/comfy/text_encoders/ideogram4.py @@ -0,0 +1,77 @@ +"""Ideogram 4 text encoder: Qwen3-VL-8B language model, 13-layer tap. + +Ideogram 4 conditions on the concatenation of hidden states from 13 layers of +Qwen3-VL (layers 0,3,...,33,35), giving a 4096*13 = 53248-dim feature per token. +""" + +import os + +from transformers import Qwen2Tokenizer + +import comfy.text_encoders.llama +from comfy import sd1_clip + +# Reference taps outputs of layers (0,3,...,35); comfy captures layer inputs, offset by +1. +IDEOGRAM4_TAP_LAYERS = [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 36] + + +class Qwen3VLTokenizer(sd1_clip.SDTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer") + super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, + embedding_size=4096, embedding_key='qwen3vl_8b', tokenizer_class=Qwen2Tokenizer, + has_start_token=False, has_end_token=False, pad_to_max_length=False, + max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data) + + +class Ideogram4Tokenizer(sd1_clip.SD1Tokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, + name="qwen3vl_8b", tokenizer=Qwen3VLTokenizer) + + self.llama_template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" + + def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs): + if llama_template is None: + llama_text = self.llama_template.format(text) + else: + llama_text = llama_template.format(text) + return super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs) + + +# Qwen3-VL-8B = 5e6 (vs plain Qwen3-8B's 1e6) +# final_norm/lm_head off -> Ideogram only reads raw tapped hidden states +QWEN3VL_8B_CONFIG = {"rope_theta": 5000000.0, "final_norm": False, "lm_head": False} + + +class Qwen3VL8BModel(sd1_clip.SDClipModel): + def __init__(self, device="cpu", layer="hidden", layer_idx=None, dtype=None, attention_mask=True, model_options={}): + super().__init__(device=device, layer=IDEOGRAM4_TAP_LAYERS, layer_idx=None, + textmodel_json_config=dict(QWEN3VL_8B_CONFIG), + dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, + model_class=comfy.text_encoders.llama.Qwen3_8B, + enable_attention_masks=attention_mask, return_attention_masks=attention_mask, + model_options=model_options) + + +class Ideogram4TEModel(sd1_clip.SD1ClipModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + super().__init__(device=device, dtype=dtype, name="qwen3vl_8b", clip_model=Qwen3VL8BModel, model_options=model_options) + + def encode_token_weights(self, token_weight_pairs): + out, pooled, extra = super().encode_token_weights(token_weight_pairs) + b, n, seq, h = out.shape # (B, n_taps=13, seq, 4096) stacked in ascending layer order. + out = out.permute(0, 2, 3, 1).reshape(b, seq, h * n) # (B, seq, 4096*13). permute -> (B, seq, H, taps). + return out, pooled, extra + + +def te(dtype_llama=None, llama_quantization_metadata=None): + class Ideogram4TEModel_(Ideogram4TEModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + if dtype_llama is not None: + dtype = dtype_llama + if llama_quantization_metadata is not None: + model_options = model_options.copy() + model_options["quantization_metadata"] = llama_quantization_metadata + super().__init__(device=device, dtype=dtype, model_options=model_options) + return Ideogram4TEModel_ diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index c3346bf09b72..b790d7aacdbb 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -1,5 +1,7 @@ import math import comfy.samplers +import comfy.sampler_helpers +import comfy.patcher_extension import comfy.sample from comfy.k_diffusion import sampling as k_diffusion_sampling from comfy.k_diffusion import sa_solver @@ -894,6 +896,83 @@ def execute(cls, model, cond1, cond2, negative, cfg_conds, cfg_cond2_negative, s get_guider = execute +class Guider_DualModel(comfy.samplers.CFGGuider): + # Runs the positive (cond) pass on the main model and the negative (uncond) pass on a separate model + def __init__(self, model_patcher, uncond_model_patcher): + super().__init__(model_patcher) + self.uncond_model_patcher = uncond_model_patcher + self.uncond_inner = None + + def outer_sample(self, noise, latent_image, sampler, sigmas, denoise_mask=None, callback=None, disable_pbar=False, seed=None, latent_shapes=None): + self.uncond_inner = None + self.uncond_loaded = [] + self._uncond_neg = None + # skip at cfg 1.0 + if not math.isclose(self.cfg, 1.0): + uc = {"negative": list(map(lambda a: a.copy(), self.conds["negative"]))} + self.uncond_inner, uc, self.uncond_loaded = comfy.sampler_helpers.prepare_sampling( + self.uncond_model_patcher, noise.shape, uc, self.uncond_model_patcher.model_options) + self._uncond_neg = uc["negative"] + self.uncond_model_patcher.pre_run() + try: + return super().outer_sample(noise, latent_image, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes) + finally: + if self.uncond_inner is not None: + self.uncond_model_patcher.cleanup() + comfy.sampler_helpers.cleanup_models({"negative": self._uncond_neg}, self.uncond_loaded) + self.uncond_inner = None + + def inner_sample(self, noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=None): + if self.uncond_inner is not None: + li = latent_image + if li is not None and torch.count_nonzero(li) > 0: + li = self.uncond_inner.process_latent_in(li) + self._uncond_conds = comfy.samplers.process_conds( + self.uncond_inner, noise, {"negative": self._uncond_neg}, device, li, denoise_mask, seed, latent_shapes=latent_shapes)["negative"] + return super().inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes) + + def predict_noise(self, x, timestep, model_options={}, seed=None): + positive = self.conds.get("positive", None) + if self.uncond_inner is None: # cfg == 1 or no negative -> single model, cond only + return comfy.samplers.calc_cond_batch(self.inner_model, [positive], x, timestep, model_options)[0] + cond = comfy.samplers.calc_cond_batch(self.inner_model, [positive], x, timestep, model_options)[0] + + uncond_model_options = model_options + if "multigpu_clones" in model_options: # TODO: support multigpu instead of just running uncond on a single GPU + uncond_model_options = {k: v for k, v in model_options.items() if k != "multigpu_clones"} + uncond = comfy.samplers.calc_cond_batch(self.uncond_inner, [self._uncond_conds], x, timestep, uncond_model_options)[0] + return comfy.samplers.cfg_function(self.inner_model, cond, uncond, self.cfg, x, timestep, + model_options=model_options, cond=positive, uncond=self._uncond_conds) + +class DualModelGuider(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="DualModelGuider", + display_name="Dual Model CFG Guider", + category="model/sampling/guiders", + inputs=[ + io.Model.Input("model", tooltip="Model used for the positive (conditional) pass."), + io.Model.Input("model_negative", optional=True, tooltip="Model used for the negative (unconditional) pass. Use the same model for ordinary CFG."), + io.Conditioning.Input("positive"), + io.Float.Input("cfg", default=4.0, min=0.0, max=100.0, step=0.1, round=0.01), + io.Conditioning.Input("negative", optional=True, tooltip="Negative conditioning run on the negative model. Leave unconnected for a text-free (image-only) unconditional pass."), + ], + outputs=[io.Guider.Output()], + ) + + @classmethod + def execute(cls, model, positive, cfg, model_negative=None, negative=None) -> io.NodeOutput: + if negative is None: + negative = [[None, {}]] # null cond -> no cross_attn -> model runs image-only + + guider = Guider_DualModel(model, model_negative) if model_negative is not None else comfy.samplers.CFGGuider(model) + guider.set_conds(positive, negative) + guider.set_cfg(cfg) + return io.NodeOutput(guider) + + get_guider = execute + class DisableNoise(io.ComfyNode): @classmethod def define_schema(cls): @@ -1054,11 +1133,53 @@ def execute(cls, sigmas) -> io.NodeOutput: sigmas = torch.FloatTensor(sigmas) return io.NodeOutput(sigmas) +class CFGOverride(io.ComfyNode): + @classmethod + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="CFGOverride", + display_name="CFG Override", + description="Override cfg to a fixed value over a [start, end] percent slice of the steps. " + "With multiple overrides, the one nearest the sampler wins on overlap.", + category="sampling/custom_sampling", + inputs=[ + io.Model.Input("model"), + io.Float.Input("cfg", default=1.0, min=0.0, max=100.0, step=0.1, round=0.01), + io.Float.Input("start_percent", default=0.0, min=0.0, max=1.0, step=0.001), + io.Float.Input("end_percent", default=1.0, min=0.0, max=1.0, step=0.001), + ], + outputs=[io.Model.Output()], + ) + + @classmethod + def execute(cls, model, cfg, start_percent, end_percent) -> io.NodeOutput: + ms = model.get_model_object("model_sampling") + sigma_hi = ms.percent_to_sigma(start_percent) # percent->sigma decreasing, so hi >= lo + sigma_lo = ms.percent_to_sigma(end_percent) + + def predict_noise_wrapper(executor, *args, **kwargs): + sigma = float(args[1].flatten()[0]) # args = (x, timestep, model_options, seed) + if not (sigma_lo <= sigma <= sigma_hi): + return executor(*args, **kwargs) + guider = executor.class_obj # guider.cfg feeds cond_scale + saved = guider.cfg + guider.cfg = cfg + try: + return executor(*args, **kwargs) + finally: + guider.cfg = saved # restore for other steps/overrides + + m = model.clone() + m.add_wrapper(comfy.patcher_extension.WrappersMP.PREDICT_NOISE, predict_noise_wrapper) + return io.NodeOutput(m) + + class CustomSamplersExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[io.ComfyNode]]: return [ SamplerCustom, + CFGOverride, BasicScheduler, KarrasScheduler, ExponentialScheduler, @@ -1087,6 +1208,7 @@ async def get_node_list(self) -> list[type[io.ComfyNode]]: SamplingPercentToSigma, CFGGuider, DualCFGGuider, + DualModelGuider, BasicGuider, RandomNoise, DisableNoise, diff --git a/comfy_extras/nodes_ideogram4.py b/comfy_extras/nodes_ideogram4.py new file mode 100644 index 000000000000..d5827db4f260 --- /dev/null +++ b/comfy_extras/nodes_ideogram4.py @@ -0,0 +1,64 @@ +"""Ideogram 4 sampling helper +""" + +import math + +import torch +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io + +_LOGSNR_MIN = -15.0 +_LOGSNR_MAX = 18.0 + + +def _logit_normal_schedule(u, mean, std): + # Reference time (0=noise..1=clean) via the probit/ndtri quantile. + u = torch.as_tensor(u, dtype=torch.float64) + t = 1.0 - torch.special.expit(mean + std * torch.special.ndtri(u)) + t_min = 1.0 / (1.0 + math.exp(0.5 * _LOGSNR_MAX)) + t_max = 1.0 / (1.0 + math.exp(0.5 * _LOGSNR_MIN)) + return t.clamp(t_min, t_max) + + +def ideogram4_sigmas(num_steps, width, height, mu, std): + """Descending sigmas (len num_steps+1) for the reference schedule. + + mu + the resolution term form the logSNR shift; std is the spread. + """ + mean = mu + 0.5 * math.log((width * height) / (512 * 512)) + u = torch.linspace(0.0, 1.0, num_steps + 1, dtype=torch.float64) + sigmas = (1.0 - _logit_normal_schedule(u, mean, std)).flip(0) + sigmas[-1] = 0.0 # clamp leaves ~6e-4; force full denoise + return sigmas.to(torch.float32) + + +class Ideogram4Scheduler(io.ComfyNode): + @classmethod + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="Ideogram4Scheduler", + display_name="Ideogram 4 Scheduler", + category="sampling/custom_sampling/schedulers", + inputs=[ + io.Int.Input("steps", default=20, min=1, max=200), + io.Int.Input("width", default=1024, min=256, max=8192, step=16), + io.Int.Input("height", default=1024, min=256, max=8192, step=16), + io.Float.Input("mu", default=0.0, min=-10.0, max=10.0, step=0.05), + io.Float.Input("std", default=1.75, min=0.1, max=5.0, step=0.05), + ], + outputs=[io.Sigmas.Output()], + ) + + @classmethod + def execute(cls, steps, width, height, mu, std) -> io.NodeOutput: + return io.NodeOutput(ideogram4_sigmas(steps, width, height, mu, std)) + + +class Ideogram4Extension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [Ideogram4Scheduler] + + +async def comfy_entrypoint() -> Ideogram4Extension: + return Ideogram4Extension() diff --git a/nodes.py b/nodes.py index 331425b8724e..2f5a478b59e3 100644 --- a/nodes.py +++ b/nodes.py @@ -969,7 +969,7 @@ class CLIPLoader: @classmethod def INPUT_TYPES(s): return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ), - "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens", "pixeldit"], ), + "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens", "pixeldit", "ideogram4"], ), }, "optional": { "device": (["default", "cpu"], {"advanced": True}), @@ -2362,6 +2362,7 @@ async def init_builtin_extra_nodes(): "nodes_model_downscale.py", "nodes_images.py", "nodes_video_model.py", + "nodes_ideogram4.py", "nodes_train.py", "nodes_dataset.py", "nodes_sag.py", From f69225df245991dd0b1e212737805f48791d5cc3 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 3 Jun 2026 08:55:18 -0700 Subject: [PATCH 3/5] Mark DualModelGuider as experimental (#14262) --- comfy_extras/nodes_custom_sampler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index b790d7aacdbb..2f4ff1f70708 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -951,6 +951,7 @@ def define_schema(cls): node_id="DualModelGuider", display_name="Dual Model CFG Guider", category="model/sampling/guiders", + is_experimental=True, inputs=[ io.Model.Input("model", tooltip="Model used for the positive (conditional) pass."), io.Model.Input("model_negative", optional=True, tooltip="Model used for the negative (unconditional) pass. Use the same model for ordinary CFG."), From f0619af65927bc0c42e4b35635c7fc8158566b43 Mon Sep 17 00:00:00 2001 From: "Daxiong (Lin)" Date: Thu, 4 Jun 2026 00:10:26 +0800 Subject: [PATCH 4/5] chore: update workflow templates to v0.9.94 (#14263) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7dff9e3c3999..79d38fc066fd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.44.19 -comfyui-workflow-templates==0.9.92 +comfyui-workflow-templates==0.9.94 comfyui-embedded-docs==0.5.2 torch torchsde From 8e3045a90b4bed502ee06a9dd3032805579cccb9 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 3 Jun 2026 09:19:18 -0700 Subject: [PATCH 5/5] Memory usage factor for ideogram 4 on non dynamic vram. (#14264) --- comfy/supported_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 478489ed891b..7cf9c133b9cb 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1757,7 +1757,7 @@ class Ideogram4(supported_models_base.BASE): "shift": 1.0, } - memory_usage_factor = 1.8 # TODO + memory_usage_factor = 11.6 unet_extra_config = { "num_attention_heads": 18,