In [8]:
import torch
from diffusers import UNet2DConditionModel, AutoencoderKL, DDIMScheduler
from transformers import CLIPTextModel, CLIPTokenizer
from tqdm.auto import tqdm

# Checkpoint identifiers. The UNet and the scheduler config are read from the
# same repository; the text encoder and tokenizer share the CLIP repository.
_SD_REPO = "CompVis/stable-diffusion-v1-4"
_CLIP_REPO = "openai/clip-vit-large-patch14"
_VAE_REPO = "stabilityai/sd-vae-ft-ema"

# All neural components are loaded in float16 and moved to the GPU.
_FP16 = {"torch_dtype": torch.float16}

vae = AutoencoderKL.from_pretrained(_VAE_REPO, **_FP16).to("cuda")
unet = UNet2DConditionModel.from_pretrained(_SD_REPO, subfolder="unet", **_FP16).to("cuda")
text_encoder = CLIPTextModel.from_pretrained(_CLIP_REPO, **_FP16).to("cuda")
# The tokenizer and scheduler hold no fp16 weights, so they take no dtype/device arguments.
tokenizer = CLIPTokenizer.from_pretrained(_CLIP_REPO)
scheduler = DDIMScheduler.from_pretrained(_SD_REPO, subfolder="scheduler")

def text_enc(prompts):
    """Tokenize text prompts and run them through the CLIP text encoder.

    Args:
        prompts: A string or a list of strings to encode.

    Returns:
        Element ``[0]`` of the ``text_encoder`` output, cast to float16
        (presumably the last hidden state, one row per prompt; confirm
        against the encoder's output class).
    """
    # Every prompt is padded up to 77 tokens. `truncation=True` also caps
    # longer prompts at 77 so the batch is always rectangular and never
    # longer than the sequence length the encoder is called with here.
    inputs = tokenizer(
        prompts,
        padding="max_length",
        max_length=77,
        truncation=True,
        return_tensors="pt",
    )
    return text_encoder(inputs.input_ids.to("cuda"))[0].to(torch.float16)

@torch.no_grad()
def mk_samples_with_negative_prompt(prompts, neg_prompts, g=7.5, seed=100, steps=70):
    """Generate images with guidance steered away from negative prompts.

    For each denoising step the UNet is evaluated on a doubled batch
    (negative-prompt embeddings first, prompt embeddings second) and the two
    predictions are combined as ``neg + g * (pos - neg)``.

    Args:
        prompts: List of prompts (a single string is treated as a one-item list).
        neg_prompts: List of negative prompts, one per entry in ``prompts``
            (a single string is treated as a one-item list).
        g: Guidance scale applied to the difference between the two predictions.
        seed: Value passed to ``torch.manual_seed`` before sampling the initial
            latents. ``None`` skips seeding; ``0`` is a valid seed.
        steps: Number of scheduler timesteps.

    Returns:
        The ``.sample`` tensor produced by ``vae.decode`` for the final latents.

    Raises:
        ValueError: If ``prompts`` and ``neg_prompts`` differ in length.
    """
    # A bare string would otherwise make `bs` the character count.
    if isinstance(prompts, str):
        prompts = [prompts]
    if isinstance(neg_prompts, str):
        neg_prompts = [neg_prompts]
    if len(prompts) != len(neg_prompts):
        raise ValueError(
            f"prompts and neg_prompts must have the same length, "
            f"got {len(prompts)} and {len(neg_prompts)}"
        )

    bs = len(prompts)
    text = text_enc(prompts)
    neg_text = text_enc(neg_prompts)
    # Order matters: the first half of the batch is the negative branch.
    emb = torch.cat([neg_text, text]).to(torch.float16)

    # `is not None` rather than truthiness so that seed=0 is honoured.
    if seed is not None:
        torch.manual_seed(seed)

    height, width = 512, 512
    # Latents are 1/8 of the image resolution in each spatial dimension.
    latents = torch.randn((bs, unet.config.in_channels, height // 8, width // 8), dtype=torch.float16).to("cuda")
    scheduler.set_timesteps(steps)
    latents = latents * scheduler.init_noise_sigma

    for ts in tqdm(scheduler.timesteps):
        # Duplicate the latents so both branches run in one UNet call.
        inp = scheduler.scale_model_input(torch.cat([latents] * 2), ts)
        n, t = unet(inp, ts, encoder_hidden_states=emb).sample.chunk(2)
        pred = n + g * (t - n)
        latents = scheduler.step(pred, ts, latents).prev_sample

    # 0.18215 is presumably the latent scaling factor of the SD v1 VAE;
    # confirm against `vae.config` before swapping in a different VAE.
    return vae.decode(1 / 0.18215 * latents).sample

from PIL import Image

prompts = ["A little girl"]
neg_prompts = ["A little boy"]
generated_images = mk_samples_with_negative_prompt(prompts, neg_prompts, g=7.5, seed=42, steps=70)

# Convert each decoded image to an 8-bit height x width x channel array and save it as PNG.
for i, img in enumerate(generated_images):
    # Shift and scale into [0, 1] (assumes the decoder output is roughly in
    # [-1, 1]); work in float32 so the scaling below is not done in half precision.
    img = (img.float() / 2 + 0.5).clamp(0, 1)
    # Round before the uint8 cast: a plain cast truncates and biases every
    # pixel value downward.
    img = (img * 255).round().cpu().numpy().astype("uint8").transpose(1, 2, 0)
    Image.fromarray(img).save(f"generated_image_{i}.png")

Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.
Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


  0%|          | 0/70 [00:00<?, ?it/s]