In [None]:
import torch
from diffusers import FluxPipeline

pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
pipe.enable_sequential_cpu_offload() #save some VRAM by offloading the model to CPU. Remove this if you have enough GPU power
# pipe.
prompt = "portrait of abstraction architects"
image = pipe(
    prompt,
    guidance_scale=0.0,
    num_inference_steps=4,
    max_sequence_length=256,width=512, height=512,
    generator=torch.Generator("cpu").manual_seed(0)
).images[0]
image.save("flux-schnell-aa.png")


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [1]:
import torch
from diffusers import FluxPipeline

pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [2]:
list(pipe.transformer.named_children())

In [16]:
pipe.__dict__.keys()

dict_keys(['_internal_dict', 'vae', 'text_encoder', 'tokenizer', 'unet', 'scheduler', 'safety_checker', 'feature_extractor', 'image_encoder', 'vae_scale_factor', 'image_processor', '_guidance_scale', '_guidance_rescale', '_clip_skip', '_cross_attention_kwargs', '_interrupt', '_num_timesteps', '_progress_bar_config'])

In [41]:
def get_all_modules_name(pipe):
    res = []
    for k in pipe.__dict__:
        if isinstance(getattr(pipe, k), torch.nn.modules.module.Module):
            res.append(k)
    return res

In [42]:
get_all_modules_name(pipe)

['vae', 'text_encoder', 'unet', 'safety_checker']

In [46]:
pipe.image_processor

VaeImageProcessor {
  "_class_name": "VaeImageProcessor",
  "_diffusers_version": "0.30.0",
  "do_binarize": false,
  "do_convert_grayscale": false,
  "do_convert_rgb": false,
  "do_normalize": true,
  "do_resize": true,
  "resample": "lanczos",
  "vae_latent_channels": 4,
  "vae_scale_factor": 8
}

In [1]:
!start

## a

In [None]:
import torch
from torch.ao.quantization import quantize_dynamic
import gc
import numpy as np

def get_model_size(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()
    size_all_mb = (param_size + buffer_size) / 1024**2
    return size_all_mb

def check_submodule_sizes(model, parent_name=''):
    sizes = []
    for name, submodule in model.named_children():
        full_name = f"{parent_name}.{name}" if parent_name else name
        size_mb = get_model_size(submodule)
        sizes.append((full_name, size_mb))
        sizes.extend(check_submodule_sizes(submodule, full_name))
    return sizes

def quantize_module(module, dtype=torch.qint8):
    return quantize_dynamic(
        module,
        {torch.nn.Linear},
        dtype=dtype
    )

def quantize_transformer_submodules(transformer, dtype=torch.qint8, size_threshold_mb=100):
    sizes = check_submodule_sizes(transformer)
    sizes.sort(key=lambda x: x[1], reverse=True)
    
    # print("Submodule sizes before quantization:")
    # for name, size in sizes:
    #     print(f"{name}: {size:.2f} MB")
    
    total_size_before = sum(size for _, size in sizes)
    print(f"\nTotal model size before quantization: {total_size_before:.2f} MB")
    
    for name, size in sizes:
        if size > size_threshold_mb:
            print(f"\nQuantizing {name} ({size:.2f} MB)...")
            submodule = transformer
            for part in name.split('.'):
                submodule = getattr(submodule, part)
            setattr(submodule, part, quantize_module(submodule, dtype))
            gc.collect()
            torch.cuda.empty_cache()
    
    sizes_after = check_submodule_sizes(transformer)
    total_size_after = sum(size for _, size in sizes_after)
    print(f"\nTotal model size after quantization: {total_size_after:.2f} MB")
    print(f"Size reduction: {(1 - total_size_after/total_size_before)*100:.2f}%")
    
    return transformer

# Assuming 'pipe' is your pipeline object
# Move to CPU for quantization
pipe.to('cpu')

# Clear GPU memory
torch.cuda.empty_cache()
gc.collect()

try:
    # Check sizes and quantize transformer submodules
    pipe.transformer = quantize_transformer_submodules(pipe.transformer, size_threshold_mb=100)
    print("Quantization successful")
except Exception as e:
    print(f"Error during quantization: {e}")

# Move back to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipe.to(device)

# Example usage
prompt = "A serene landscape with mountains and a lake"
image = pipe(prompt).images[0]
image.save("generated_image.png")

Submodule sizes before quantization:
transformer_blocks: 12315.36 MB
single_transformer_blocks: 10262.47 MB
transformer_blocks.0: 648.18 MB
transformer_blocks.1: 648.18 MB
transformer_blocks.2: 648.18 MB
transformer_blocks.3: 648.18 MB
transformer_blocks.4: 648.18 MB
transformer_blocks.5: 648.18 MB
transformer_blocks.6: 648.18 MB
transformer_blocks.7: 648.18 MB
transformer_blocks.8: 648.18 MB
transformer_blocks.9: 648.18 MB
transformer_blocks.10: 648.18 MB
transformer_blocks.11: 648.18 MB
transformer_blocks.12: 648.18 MB
transformer_blocks.13: 648.18 MB
transformer_blocks.14: 648.18 MB
transformer_blocks.15: 648.18 MB
transformer_blocks.16: 648.18 MB
transformer_blocks.17: 648.18 MB
transformer_blocks.18: 648.18 MB
single_transformer_blocks.0: 270.06 MB
single_transformer_blocks.1: 270.06 MB
single_transformer_blocks.2: 270.06 MB
single_transformer_blocks.3: 270.06 MB
single_transformer_blocks.4: 270.06 MB
single_transformer_blocks.5: 270.06 MB
single_transformer_blocks.6: 270.06 MB
si

## b

In [None]:
import torch
from torch.ao.quantization import quantize_dynamic
import gc
import numpy as np
import torch
from diffusers import FluxPipeline

pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)

def get_model_size(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()
    size_all_mb = (param_size + buffer_size) / 1024**2
    return size_all_mb

def check_submodule_sizes(model, parent_name=''):
    sizes = []
    for name, submodule in model.named_children():
        full_name = f"{parent_name}.{name}" if parent_name else name
        size_mb = get_model_size(submodule)
        sizes.append((full_name, size_mb))
        sizes.extend(check_submodule_sizes(submodule, full_name))
    return sizes

def quantize_module(module, dtype=torch.qint8):
    return quantize_dynamic(
        module,
        {torch.nn.Linear},
        dtype=dtype
    )

def recursive_quantize(module, dtype=torch.qint8, size_threshold_mb=10, parent_name=''):
    quantized = False
    for name, submodule in module.named_children():
        full_name = f"{parent_name}.{name}" if parent_name else name
        size_mb = get_model_size(submodule)
        
        if size_mb > size_threshold_mb:
            print(f"Quantizing {full_name} ({size_mb:.2f} MB)...")
            if list(submodule.children()):  # If submodule has children
                submodule = recursive_quantize(submodule, dtype, size_threshold_mb, full_name)
            else:
                submodule = quantize_module(submodule, dtype)
            setattr(module, name, submodule)
            quantized = True
            gc.collect()
            torch.cuda.empty_cache()
        else:
            print(f"Skipping {full_name} ({size_mb:.2f} MB) - below threshold")
    
    if quantized:
        return module
    else:
        return quantize_module(module, dtype)

def quantize_transformer_recursive(transformer, dtype=torch.qint8, size_threshold_mb=10):
    sizes_before = check_submodule_sizes(transformer)
    sizes_before.sort(key=lambda x: x[1], reverse=True)
    
    # print("Submodule sizes before quantization:")
    # for name, size in sizes_before:
    #     print(f"{name}: {size:.2f} MB")
    
    total_size_before = sum(size for _, size in sizes_before)
    print(f"\nTotal model size before quantization: {total_size_before:.2f} MB")
    
    transformer = recursive_quantize(transformer, dtype, size_threshold_mb)
    
    sizes_after = check_submodule_sizes(transformer)
    total_size_after = sum(size for _, size in sizes_after)
    print(f"\nTotal model size after quantization: {total_size_after:.2f} MB")
    print(f"Size reduction: {(1 - total_size_after/total_size_before)*100:.2f}%")
    
    return transformer

# Assuming 'pipe' is your pipeline object
# Move to CPU for quantization
pipe.to('cpu')

# Clear GPU memory
torch.cuda.empty_cache()
gc.collect()

try:
    # Recursively check sizes and quantize transformer submodules
    pipe.transformer = quantize_transformer_recursive(pipe.transformer, size_threshold_mb=10)
    print("Quantization successful")
except Exception as e:
    print(f"Error during quantization: {e}")

# Move back to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipe.to(device)

# Example usage
prompt = "A serene landscape with mountains and a lake"
image = pipe(prompt).images[0]
image.save("generated_image.png")

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


Total model size before quantization: 92929.16 MB
Skipping pos_embed (0.00 MB) - below threshold
Quantizing time_text_embed (42.02 MB)...
Skipping time_text_embed.time_proj (0.00 MB) - below threshold
Quantizing time_text_embed.timestep_embedder (19.51 MB)...
Skipping time_text_embed.timestep_embedder.linear_1 (1.51 MB) - below threshold
Skipping time_text_embed.timestep_embedder.act (0.00 MB) - below threshold
Quantizing time_text_embed.timestep_embedder.linear_2 (18.01 MB)...
Quantizing time_text_embed.text_embedder (22.51 MB)...
Skipping time_text_embed.text_embedder.linear_1 (4.51 MB) - below threshold
Skipping time_text_embed.text_embedder.act_1 (0.00 MB) - below threshold
Quantizing time_text_embed.text_embedder.linear_2 (18.01 MB)...
Quantizing context_embedder (24.01 MB)...
Skipping x_embedder (0.38 MB) - below threshold
Quantizing transformer_blocks (12315.36 MB)...
Quantizing transformer_blocks.0 (648.18 MB)...
Quantizing transformer_blocks.0.norm1 (108.04 MB)...
Skipping tr

## c

In [1]:
from diffusers import StableDiffusionPipeline
import torch

pipe_uq = StableDiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    torch_dtype=torch.float16
)
pipe_uq = pipe_uq.to("cuda")

prompt = "a photo of an astronaut riding a horse on mars"
image = pipe_uq(prompt).images[0]  
    
image.save("as_uq1.png")


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


  0%|          | 0/50 [00:00<?, ?it/s]

In [1]:
from diffusers import StableDiffusionPipeline
import torch

pipe_uq = StableDiffusionPipeline.from_pretrained(
    ".quantized",
    torch_dtype=torch.float16
)
pipe_uq = pipe_uq.to("cuda")

prompt = "a photo of an astronaut riding a horse on mars"
image = pipe_uq(prompt).images[0]  
    
image.save("as_q1.png")


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


  0%|          | 0/50 [00:00<?, ?it/s]

In [56]:
import torch
from torch.ao.quantization import quantize_dynamic
import gc
import numpy as np
def get_model_size(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()
    size_all_mb = (param_size + buffer_size) / 1024**2
    return size_all_mb

def check_submodule_sizes(model, parent_name=''):
    sizes = []
    for name, submodule in model.named_children():
        full_name = f"{parent_name}.{name}" if parent_name else name
        size_mb = get_model_size(submodule)
        sizes.append((full_name, size_mb))
        sizes.extend(check_submodule_sizes(submodule, full_name))
    return sizes

def quantize_module(module, dtype=torch.qint8):
    return quantize_dynamic(
        module,
        {torch.nn.Linear},
        dtype=dtype
    )

def recursive_quantize(module, dtype=torch.qint8, size_threshold_mb=10, parent_name=''):
    quantized = False
    for name, submodule in module.named_children():
        full_name = f"{parent_name}.{name}" if parent_name else name
        size_mb = get_model_size(submodule)
        
        if size_mb > size_threshold_mb:
            print(f"Quantizing {full_name} ({size_mb:.2f} MB)...")
            if list(submodule.children()):  # If submodule has children
                submodule = recursive_quantize(submodule, dtype, size_threshold_mb, full_name)
            else:
                submodule = quantize_module(submodule, dtype)
            setattr(module, name, submodule)
            quantized = True
            gc.collect()
            torch.cuda.empty_cache()
        else:
            print(f"Skipping {full_name} ({size_mb:.2f} MB) - below threshold")
    
    if quantized:
        return module
    else:
        return quantize_module(module, dtype)

def quantize_transformer_recursive(transformer, dtype=torch.qint8, size_threshold_mb=10):
    sizes_before = check_submodule_sizes(transformer)
    sizes_before.sort(key=lambda x: x[1], reverse=True)
    
    total_size_before = sum(size for _, size in sizes_before)
    print(f"\nTotal model size before quantization: {total_size_before:.2f} MB")
    
    transformer = recursive_quantize(transformer, dtype, size_threshold_mb)
    
    sizes_after = check_submodule_sizes(transformer)
    total_size_after = sum(size for _, size in sizes_after)
    print(f"\nTotal model size after quantization: {total_size_after:.2f} MB")
    print(f"Size reduction: {(1 - total_size_after/total_size_before)*100:.2f}%")
    
    return transformer


# Clear GPU memory
torch.cuda.empty_cache()
gc.collect()

try:
    # Recursively check sizes and quantize transformer submodules
    pipe.transformer = quantize_transformer_recursive(pipe.vae, size_threshold_mb=10)
    print("Quantization successful")
except Exception as e:
    print(f"Error during quantization: {e}")

# Move back to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipe.to(device)

# Example usage
prompt = "A serene landscape with mountains and a lake"
image = pipe(prompt).images[0]
image.save("generated_image_1.png")

Pipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It is not recommended to move them to `cpu` as running them will fail. Please make sure to use an accelerator to run the pipeline in inference, due to the lack of support for`float16` operations on this device in PyTorch. Please, remove the `torch_dtype=torch.float16` argument, or use another device for inference.
Pipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It is not recommended to move them to `cpu` as running them will fail. Please make sure to use an accelerator to run the pipeline in inference, due to the lack of support for`float16` operations on this device in PyTorch. Please, remove the `torch_dtype=torch.float16` argument, or use another device for inference.
Pipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It is not recommended to move them to `cpu` as running them will fail. Please make sure to use an accelerator to run the pipeline in inference, du


Total model size before quantization: 917.82 MB
Quantizing encoder (65.16 MB)...
Skipping encoder.conv_in (0.01 MB) - below threshold
Quantizing encoder.down_blocks (45.07 MB)...
Skipping encoder.down_blocks.0 (1.41 MB) - below threshold
Skipping encoder.down_blocks.1 (5.13 MB) - below threshold
Quantizing encoder.down_blocks.2 (20.51 MB)...
Quantizing encoder.down_blocks.2.resnets (16.01 MB)...
Skipping encoder.down_blocks.2.resnets.0 (7.01 MB) - below threshold
Skipping encoder.down_blocks.2.resnets.1 (9.01 MB) - below threshold
Skipping encoder.down_blocks.2.downsamplers (4.50 MB) - below threshold
Quantizing encoder.down_blocks.3 (18.01 MB)...
Quantizing encoder.down_blocks.3.resnets (18.01 MB)...
Skipping encoder.down_blocks.3.resnets.0 (9.01 MB) - below threshold
Skipping encoder.down_blocks.3.resnets.1 (9.01 MB) - below threshold
Quantizing encoder.mid_block (20.02 MB)...
Skipping encoder.mid_block.attentions (2.01 MB) - below threshold
Quantizing encoder.mid_block.resnets (18.

  0%|          | 0/50 [00:00<?, ?it/s]

In [48]:
def get_all_modules_name(pipe):
    res = []
    for k in pipe.__dict__:
        if isinstance(getattr(pipe, k), torch.nn.modules.module.Module):
            res.append(k)
    return res

In [52]:
res = get_all_modules_name(pipe)

In [54]:
t = 0
for m in res:
    s = get_model_size(getattr(pipe, m))
    t += s
    print(m,s)

vae 159.55708122253418
text_encoder 234.7198257446289
unet 1639.406135559082
safety_checker 579.8008270263672


In [58]:
pipe.save_pretrained(".quantized")

In [60]:
prompt = "a photo of an astronaut riding a horse on mars"
image = pipe(prompt).images[0]  
    
image.save("astr_quantized.png")


  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
class Quantizater:
    def __init__(self):
        self.type = None
    def set_model_name(self, name):
        self.type = "model"
        self.model_name = name
    def set_pipeline(self, pipeline):
        self.type = "pipe"
        self.pipeline = pipeline
        self.modules = self.get_all_modules_name(pipeline)
    def execute_recursively(self):
        if self.type == "pipe":
            for module in self.modules:
                self.recursive_quantize(module, )
        elif self.type == "model":
            self.quantize_model()
    def execute(self):
        pass

    def quantize_model(self,use_4bit=True, bnb_4bit_compute_dtype="float16", 
                       bnb_4bit_quant_type= "nf4", use_nested_quant=False ):
        compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=use_4bit,
            bnb_4bit_quant_type=bnb_4bit_quant_type,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=use_nested_quant,
        )
        
        # Check GPU compatibility with bfloat16
        if compute_dtype == torch.float16 and use_4bit:
            major, _ = torch.cuda.get_device_capability()
            if major >= 8:
                print("=" * 80)
                print("Your GPU supports bfloat16: accelerate training with bf16=True")
                print("=" * 80)
                
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=bnb_config,
        )
    def get_all_modules_name(self, pipe):
        res = []
        for k in pipe.__dict__:
            if isinstance(getattr(pipe, k), torch.nn.modules.module.Module):
                res.append(k)
        return res
    def get_model_size(self, model):
        param_size = 0
        for param in model.parameters():
            param_size += param.nelement() * param.element_size()
        buffer_size = 0
        for buffer in model.buffers():
            buffer_size += buffer.nelement() * buffer.element_size()
        size_all_mb = (param_size + buffer_size) / 1024**2
        return size_all_mb
    def quantize_module(self, module, dtype=torch.qint8):
        return quantize_dynamic(
            module,
            {torch.nn.Linear},
            dtype=dtype
        )
    def recursive_quantize(self, module, dtype=torch.qint8, size_threshold_mb=10, parent_name=''):
        quantized = False
        for name, submodule in module.named_children():
            full_name = f"{parent_name}.{name}" if parent_name else name
            size_mb = get_model_size(submodule)
            
            if size_mb > size_threshold_mb:
                print(f"Quantizing {full_name} ({size_mb:.2f} MB)...")
                if list(submodule.children()):  # If submodule has children
                    submodule = self.recursive_quantize(submodule, dtype, size_threshold_mb, full_name)
                else:
                    submodule = self.quantize_module(submodule, dtype)
                setattr(module, name, submodule)
                quantized = True
                gc.collect()
                torch.cuda.empty_cache()
            else:
                print(f"Skipping {full_name} ({size_mb:.2f} MB) - below threshold")
        
        if quantized:
            return module
        else:
            return self.quantize_module(module, dtype)
    def save(self, output_path):
        if self.type == "pipe":
            self.pipeline.save_pretrained(output_path)
        elif self.type == "model":
            self.model.save_pretrained(output_path)
        else:
            print("nothing to save")

## d

In [6]:
checkpoint_val = 0
import time
from time import strftime, localtime
checkpoint_prev_time = None
def convert_to_readable_format(t):
    return strftime('%Y-%m-%d %H:%M:%S', localtime(1347517370))

def checkpoint():
    global checkpoint_val
    if checkpoint_prev_time is None:
        checkpoint_prev_time = time.time()
        print(checkpoint_val, "started", convert_to_readable_format(checkpoint_prev_time))
    else:
        print(checkpoint_val, "delta time", convert_to_readable_format(time.time()  - checkpoint_prev_time))
        checkpoint_prev_time = time.time()
    checkpoint_val += 1

In [8]:
from diffusers import FluxTransformer2DModel
from torchao.quantization.quant_api import quanti_, int8_weight_only
import torch

checkpoint()
ckpt_id = "black-forest-labs/FLUX.1-schnell"

transformer = FluxTransformer2DModel.from_pretrained(
    ckpt_id, subfolder="transformer", torch_dtype=torch.bfloat16
)
checkpoint()
quantize_(transformer, int8_weight_only())
checkpoint()
output_dir = "./flux-schnell-int8wo"
transformer.save_pretrained(output_dir, safe_serialization=False)
checkpoint()
# Push to the Hub optionally.
# save_to = "sayakpaul/flux-schnell-int8wo"
# transformer.push_to_hub(save_to, safe_serialization=False)

ImportError: cannot import name 'quanti_' from 'torchao.quantization.quant_api' (C:\Users\rajab\miniconda3\envs\py3_120\Lib\site-packages\torchao\quantization\quant_api.py)

In [1]:
from torchao.quantization.quant_api import (
    quantize_,
    int8_dynamic_activation_int4_weight,
    int8_dynamic_activation_int8_weight,
    int8_dynamic_activation_int8_semi_sparse_weight,
    int4_weight_only,
    int8_weight_only
)


ImportError: cannot import name 'quantize_' from 'torchao.quantization.quant_api' (C:\Users\rajab\miniconda3\envs\py3_120\Lib\site-packages\torchao\quantization\quant_api.py)