# Notebook to use Stable Diffusion to generate a stylised self icon

In [None]:
# !pip install -Uq diffusers transformers fastcore

## Using Stable Diffusion

In [None]:
from PIL import Image
from fastcore.all import concat
import torch, logging
from pathlib import Path
from huggingface_hub import notebook_login
from diffusers import StableDiffusionPipeline
from PIL import Image
import matplotlib.pyplot as plt
# Suppress log records at WARNING level and below for the rest of the session.
logging.disable(logging.WARNING)

# Seed torch's global RNG so repeated runs start from the same random state.
torch.manual_seed(1)
# Only prompt for a Hugging Face login when no token file exists at ~/.huggingface/token.
if not (Path.home()/'.huggingface'/'token').exists(): notebook_login()

### Stable Diffusion Pipeline

In [None]:
# Load the fp16 revision of the Stable Diffusion v1.4 text-to-image pipeline and move it to the GPU.
# NOTE(review): `pipe` is rebound to an Img2Img pipeline in the "Image to Image" section
# before this pipeline is used to generate anything in this notebook.
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16).to("cuda")

The weights are cached in your home directory by default.

In [None]:
!ls ~/.cache/huggingface/diffusers/

We are now ready to use the pipeline to start creating images.

In [None]:
def image_grid(imgs, rows, cols):
    """Tile ``imgs`` onto one new RGB canvas that is ``cols`` tiles wide and ``rows`` tiles high.

    The tile size is taken from the first image. Images are placed left to right,
    then top to bottom, in the order they appear in ``imgs``.
    """
    tile_w, tile_h = imgs[0].size
    canvas = Image.new('RGB', size=(cols * tile_w, rows * tile_h))
    for idx, tile in enumerate(imgs):
        # idx -> (row, col) position of this tile inside the grid
        row, col = divmod(idx, cols)
        canvas.paste(tile, box=(col * tile_w, row * tile_h))
    return canvas

### Image to Image

In [None]:
from diffusers import StableDiffusionImg2ImgPipeline
from fastdownload import FastDownload

In [None]:
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    revision="fp16",
    torch_dtype=torch.float16,
).to("cuda")

### Create stylised images of self

In [None]:
# Directory holding the source portraits; change this one path to use your own photos.
IMAGE_DIR = Path("/home/images")
john_image_1 = Image.open(IMAGE_DIR/"John_conf.jpg")
john_image_2 = Image.open(IMAGE_DIR/"john_office.jpg")

In [None]:
# Style prompts: one text prompt per artistic style to try on the portraits
prompt_1 = "Portrait of a man in the style of Van Gogh"
prompt_2 = "Portrait of a man in the style of Anime"
prompt_3 = "Portrait of a man in the style of Vermeer"
prompt_4 = "Watercolor portrait of a man"

In [None]:
def generate_image_set(prompt, image, strengths, guidance_scale=None, seed=1000,
                       num_inference_steps=50, pipeline=None):
    """Run image-to-image generation once per strength and return all images in one flat list.

    Args:
        prompt: Text prompt passed to the pipeline.
        image: Starting image, passed to the pipeline as ``init_image``.
        strengths: Iterable of ``strength`` values; one pipeline call is made per value.
        guidance_scale: Forwarded to the pipeline when not ``None``. When ``None`` the
            argument is omitted, so the pipeline's own default applies.
        seed: Passed to ``torch.manual_seed`` before every pipeline call, so each
            strength starts from the same random state.
        num_inference_steps: Forwarded to the pipeline on every call.
        pipeline: Callable to use for generation. Defaults to the notebook-level ``pipe``.

    Returns:
        list: the ``.images`` of every call, concatenated in the order of ``strengths``
        (three images are requested per call).
    """
    if pipeline is None: pipeline = pipe

    # Arguments shared by every call; strength is the only thing that varies in the loop.
    call_kwargs = dict(
        prompt=prompt,
        num_images_per_prompt=3,
        init_image=image,
        num_inference_steps=num_inference_steps,
    )
    # Only pass guidance_scale when the caller actually chose one.
    if guidance_scale is not None: call_kwargs["guidance_scale"] = guidance_scale

    images = []
    for strength in strengths:
        torch.manual_seed(seed)
        images.extend(pipeline(strength=strength, **call_kwargs).images)
    return images

In [None]:
best_images = []

#### Prompt 1, gs 7.5

In [None]:
images_p1_gs75 = generate_image_set(prompt=prompt_1, image=john_image_1, strengths=(0.3, 0.55, 0.7), guidance_scale=7.5)
image_grid(images_p1_gs75, rows=3, cols=3)

#### Prompt 1, gs 5

In [None]:
images_p1_gs5 = generate_image_set(prompt=prompt_1, image=john_image_1, strengths=(0.3, 0.55, 0.7), guidance_scale=5)
# Display the guidance-scale-5 set generated on the line above.
image_grid(images_p1_gs5, rows=3, cols=3)

In [None]:
best_images.append(images_p1_gs5[-1])

#### Prompt 1, gs 3

In [None]:
images_p1_gs3 = generate_image_set(prompt=prompt_1, image=john_image_1, strengths=(0.3, 0.55, 0.7), guidance_scale=3, num_inference_steps=70)
# Display the guidance-scale-3 set generated on the line above.
image_grid(images_p1_gs3, rows=3, cols=3)

#### Prompt 1, gs 1

In [None]:
images_p1_gs1 = generate_image_set(prompt=prompt_1, image=john_image_1, strengths=(0.3, 0.55, 0.7), guidance_scale=1)
image_grid(images_p1_gs1, rows=3, cols=3)

#### Prompt 2, no gs

In [None]:
images_p2_ng = generate_image_set(prompt=prompt_2, image=john_image_1, strengths=(0.3, 0.55, 0.7))
image_grid(images_p2_ng, rows=3, cols=3)

#### Prompt 2, gs 5

In [None]:
images_p2_5 = generate_image_set(prompt=prompt_2, image=john_image_1, strengths=(0.3, 0.55, 0.7), guidance_scale=5.0)
image_grid(images_p2_5, rows=3, cols=3)

#### Prompt 3, no gs

In [None]:
images_p3_ng = generate_image_set(prompt=prompt_3, image=john_image_1, strengths=(0.3, 0.55, 0.7))
image_grid(images_p3_ng, rows=3, cols=3)

#### Prompt 4, gs 7.5

In [None]:
images_p4_75 = generate_image_set(prompt=prompt_4, image=john_image_1, strengths=(0.3, 0.55, 0.7), guidance_scale=7.5)
image_grid(images_p4_75, rows=3, cols=3)

#### Experiment to try and remove beard

In [None]:
images_p1a_75 = generate_image_set(prompt="portrait of a man without a beard in the style of Van Gogh", image=john_image_1, 
                                  strengths=(0.3, 0.55, 0.7), guidance_scale=7.5)
image_grid(images_p1a_75, rows=3, cols=3)

#### Best of image 1

In [None]:
# Hand-picked results. Keys follow "<prompt>_<guidance>_n<index>", where <index> is the
# zero-based position of the image inside its 9-image set (so n8 is [-1], n7 is [-2], n5 is [-4]).
best_images = {"p1_gs75_n8": images_p1_gs75[-1],
               "p1_gs5_n7": images_p1_gs5[-2],
               "p1_gs5_n8": images_p1_gs5[-1],
               "p1_gs3_n5": images_p1_gs3[-4],
               "p1_gs3_n7": images_p1_gs3[-2],
               "p1_gs3_n8": images_p1_gs3[-1],
               "p1_gs1_n8": images_p1_gs1[-1],
               # NOTE(review): the two p3 entries are taken from the prompt-3 set at the
               # positions their keys name - confirm these are the intended picks.
               "p3_ng_n5": images_p3_ng[-4],
               "p3_ng_n7": images_p3_ng[-2],
               "p4_g75_n4": images_p4_75[4],
               "p4_g75_n6": images_p4_75[6],
              }
export_path = Path("/home/generated_images/")
# Create the output directory on first use so saving does not fail on a fresh machine.
export_path.mkdir(parents=True, exist_ok=True)
for name, image in best_images.items():
    image.save(export_path/(name+".png"), "png")

In [None]:
img = images_p1_gs75[-1]

In [None]:
export_path = Path("/home/generated_images/")
img.save(export_path/"test.png",'png')

### Image 2

In [None]:
# NOTE(review): the return value is not assigned here; the next cell makes the same call and keeps the result.
generate_image_set(prompt=prompt_1, image=john_image_2, strengths=(0.3, 0.4, 0.55, 0.7))

In [None]:
images = generate_image_set(prompt=prompt_1, image=john_image_2, strengths=(0.3, 0.4, 0.55, 0.7))
# Three images per strength, so the grid needs len(images)//3 rows (one row per strength).
image_grid(images, rows=len(images)//3, cols=3)

In [None]:
images = generate_image_set(prompt=prompt_2, image=john_image_2, strengths=(0.3, 0.4, 0.55, 0.7))
# Three images per strength, so the grid needs len(images)//3 rows (one row per strength).
image_grid(images, rows=len(images)//3, cols=3)

In [None]:
images = generate_image_set(prompt=prompt_3, image=john_image_2, strengths=(0.3, 0.4, 0.55, 0.7))
# Three images per strength, so the grid needs len(images)//3 rows (one row per strength).
image_grid(images, rows=len(images)//3, cols=3)

In [None]:
prompt = prompt_1
# Reuse the helper: it reseeds with 1000 before each strength and returns 3 images per strength.
images = generate_image_set(prompt=prompt, image=john_image_1, strengths=(0.3, 0.4, 0.55, 0.7))
image_grid(images, rows=4, cols=3)
    

In [None]:
# Single-strength run (0.3): three images from seed 1000, 50 inference steps.
# NOTE(review): the source image is assumed to be john_image_2 (this is the "Image 2" section) - confirm.
images = generate_image_set(prompt=prompt, image=john_image_2, strengths=(0.3,), num_inference_steps=50)
image_grid(images, rows=1, cols=3)

In [None]:
# Single-strength run (0.2): three images from seed 1000, 50 inference steps.
# NOTE(review): the source image is assumed to be john_image_2 (this is the "Image 2" section) - confirm.
images = generate_image_set(prompt=prompt, image=john_image_2, strengths=(0.2,), num_inference_steps=50)
image_grid(images, rows=1, cols=3)

In [None]:
# Single-strength run (0.4): three images from seed 1000, 50 inference steps.
# NOTE(review): the source image is assumed to be john_image_2 (this is the "Image 2" section) - confirm.
images = generate_image_set(prompt=prompt, image=john_image_2, strengths=(0.4,), num_inference_steps=50)
image_grid(images, rows=1, cols=3)

In [None]:
# Single-strength run (0.4): same settings as the previous cell.
# NOTE(review): the source image is assumed to be john_image_2 (this is the "Image 2" section) - confirm.
images = generate_image_set(prompt=prompt, image=john_image_2, strengths=(0.4,), num_inference_steps=50)
image_grid(images, rows=1, cols=3)

In [None]:
from transformers import CLIPTextModel, CLIPTokenizer

In [None]:
# Load the CLIP tokenizer and text encoder; the encoder is requested in fp16 and moved to the GPU.
# NOTE(review): `torch_dtype` is presumably irrelevant for a tokenizer - confirm and drop it there if so.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16)
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16).to("cuda")

Next we'll load the `vae` and the `unet`. These are distinct models whose weights are stored inside folders of the Stable Diffusion repository. We can use the `subfolder` argument to refer to [these locations](https://huggingface.co/CompVis/stable-diffusion-v1-4/tree/main).

In [None]:
from diffusers import AutoencoderKL, UNet2DConditionModel

In [None]:
# Here we use a different VAE to the original release, which has been fine-tuned for more steps
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-ema", torch_dtype=torch.float16).to("cuda")
unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet", torch_dtype=torch.float16).to("cuda")

To make things a bit different, we'll use another scheduler. The standard pipeline uses the [PNDM Scheduler](https://arxiv.org/abs/2202.09778), but we'll use [Katherine Crowson's](https://github.com/crowsonkb) excellent K-LMS scheduler.

We need to be careful to use the same noising schedule that was used during training. The schedule is defined by the number of noising steps and the amount of noise added at each step, which is derived from the _beta_ parameters.

In the case of the k-LMS scheduler, this is how the betas evolve during the 1000 steps of the noising process used during training:

In [None]:
beta_start,beta_end = 0.00085,0.012
plt.plot(torch.linspace(beta_start**0.5, beta_end**0.5, 1000) ** 2)
plt.xlabel('Timestep')
plt.ylabel('β');

In [None]:
from diffusers import LMSDiscreteScheduler

In [None]:
scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear", num_train_timesteps=1000)

We now define the parameters we'll use for generation.

In contrast with the previous examples, we set `num_inference_steps` to 70 to get an even more defined image.

In [None]:
prompt = ["a photograph of an astronaut riding a horse"]

height = 512
width = 512
num_inference_steps = 70
guidance_scale = 7.5
batch_size = 1

We tokenize the prompt. The model requires the same number of tokens for every prompt, so padding is used to ensure we meet the required length.

In [None]:
text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
text_input['input_ids']

In [None]:
tokenizer.decode(49407)

The attention mask uses zero to represent tokens we are not interested in. These are all of the padding tokens.

In [None]:
text_input['attention_mask']

The text encoder gives us the embeddings for the text prompt we used.

In [None]:
text_embeddings = text_encoder(text_input.input_ids.to("cuda"))[0].half()
text_embeddings.shape

We also get the embeddings required to perform unconditional generation, which is achieved with an empty string: the model is free to go in whichever direction it wants as long as it results in a reasonable-looking image. These embeddings will be used to apply classifier-free guidance.

In [None]:
max_length = text_input.input_ids.shape[-1]
uncond_input = tokenizer(
    [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
)
uncond_embeddings = text_encoder(uncond_input.input_ids.to("cuda"))[0].half()
uncond_embeddings.shape

For classifier-free guidance, we need to do two forward passes. One with the conditioned input (`text_embeddings`), and another with the unconditional embeddings (`uncond_embeddings`). In practice, we can concatenate both into a single batch to avoid doing two forward passes.

In [None]:
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

To start the denoising process, we start from pure Gaussian (normal) noise. These are our initial latents.

In [None]:
torch.manual_seed(100)
latents = torch.randn((batch_size, unet.in_channels, height // 8, width // 8))
latents = latents.to("cuda").half()
latents.shape

`4×64×64` is the input shape. The decoder will later transform this latent representation into a `3×512×512` image after the denoising process is complete.

Next, we initialize the scheduler with our chosen `num_inference_steps`. This will prepare the internal state to be used during denoising.

In [None]:
scheduler.set_timesteps(num_inference_steps)

We scale the initial noise by the standard deviation required by the scheduler. This value will depend on the particular scheduler we use.

In [None]:
latents = latents * scheduler.init_noise_sigma

We are ready to write the denoising loop. The timesteps go from `999` to `0` (1000 steps that were used during training) following a particular schedule.

In [None]:
scheduler.timesteps

In [None]:
scheduler.sigmas

In [None]:
plt.plot(scheduler.timesteps, scheduler.sigmas[:-1]);

In [None]:
from tqdm.auto import tqdm

In [None]:
for t in tqdm(scheduler.timesteps):
    # Duplicate the latents so one UNet call covers both halves of `text_embeddings`
    # (unconditional first, then text-conditioned). Named `latent_input` to avoid
    # shadowing the builtin `input`.
    latent_input = torch.cat([latents] * 2)
    latent_input = scheduler.scale_model_input(latent_input, t)

    # predict the noise residual
    with torch.no_grad(): pred = unet(latent_input, t, encoder_hidden_states=text_embeddings).sample

    # perform guidance
    pred_uncond, pred_text = pred.chunk(2)
    pred = pred_uncond + guidance_scale * (pred_text - pred_uncond)

    # compute the "previous" noisy sample
    latents = scheduler.step(pred, t, latents).prev_sample

After this process completes, our `latents` contain the denoised representation of the image. We use the `vae` decoder to convert it back to pixel space.

In [None]:
with torch.no_grad(): image = vae.decode(1 / 0.18215 * latents).sample

And finally, let's convert the image to PIL so we can display it.

In [None]:
image = (image / 2 + 0.5).clamp(0, 1)
image = image[0].detach().cpu().permute(1, 2, 0).numpy()
image = (image * 255).round().astype("uint8")
Image.fromarray(image)

### Just the code

In [None]:
prompts = [
    'a photograph of an astronaut riding a horse',
    'an oil painting of an astronaut riding a horse in the style of grant wood'
]

In [None]:
text_input = tokenizer(prompts, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
text_embeddings = text_encoder(text_input.input_ids.to("cuda"))[0].half()

In [None]:
max_length = text_input.input_ids.shape[-1]
uncond_input = tokenizer([""] * len(prompts), padding="max_length", max_length=max_length, return_tensors="pt")
uncond_embeddings = text_encoder(uncond_input.input_ids.to("cuda"))[0].half()
emb = torch.cat([uncond_embeddings, text_embeddings])

In [None]:
torch.manual_seed(100)
g = guidance_scale

In [None]:
latents = torch.randn((len(prompts), unet.in_channels, height//8, width//8))
scheduler.set_timesteps(num_inference_steps)
latents = latents.to("cuda").half() * scheduler.init_noise_sigma

In [None]:
for i,ts in enumerate(tqdm(scheduler.timesteps)):
    inp = scheduler.scale_model_input(torch.cat([latents] * 2), ts)
    with torch.no_grad(): u,t = unet(inp, ts, encoder_hidden_states=emb).sample.chunk(2)
    pred = u + g*(t-u)
    latents = scheduler.step(pred, ts, latents).prev_sample

In [None]:
with torch.no_grad(): image = vae.decode(1 / 0.18215 * latents).sample
res = (image / 2 + 0.5).clamp(0, 1)

In [None]:
image = res[0].detach().cpu().permute(1, 2, 0).numpy()
image = (image * 255).round().astype("uint8")
Image.fromarray(image)

In [None]:
image = res[1].detach().cpu().permute(1, 2, 0).numpy()
image = (image * 255).round().astype("uint8")
Image.fromarray(image)

### Put it in functions

In [None]:
def text_enc(prompts, maxlen=None):
    """Tokenize ``prompts`` and return their text-encoder embeddings in half precision.

    Args:
        prompts: List of prompt strings.
        maxlen: Length every prompt is padded/truncated to. Defaults to
            ``tokenizer.model_max_length``.

    Returns:
        The first output of ``text_encoder`` for the token ids, converted with ``.half()``.
        Uses the notebook-level ``tokenizer`` and ``text_encoder``.
    """
    if maxlen is None: maxlen = tokenizer.model_max_length
    inp = tokenizer(prompts, padding="max_length", max_length=maxlen, truncation=True, return_tensors="pt")
    # Inference only: skip autograd bookkeeping, and send the ids to whatever device
    # the encoder lives on instead of assuming "cuda".
    with torch.no_grad():
        return text_encoder(inp.input_ids.to(text_encoder.device))[0].half()

def mk_img(t):
    """Convert one channel-first image tensor with values around [-1, 1] into a PIL image.

    Values are shifted to [0, 1] and clamped, the channel axis is moved last,
    and the result is scaled to 8-bit before being handed to ``Image.fromarray``.
    """
    unit = (t / 2 + 0.5).clamp(0, 1)
    channels_last = unit.detach().cpu().permute(1, 2, 0).numpy()
    pixels = (channels_last * 255).round().astype("uint8")
    return Image.fromarray(pixels)

In [None]:
def mk_samples(prompts, g=7.5, seed=100, steps=70):
    """Generate one decoded image tensor per prompt using classifier-free guidance.

    Relies on the notebook-level ``text_enc``, ``unet``, ``vae``, ``scheduler``,
    ``tqdm``, ``height`` and ``width``.

    Args:
        prompts: List of prompt strings; one sample is produced per prompt.
        g: Guidance scale applied as ``uncond + g * (text - uncond)``.
        seed: Passed to ``torch.manual_seed`` before the initial noise is drawn.
            Pass ``None`` to leave the RNG state untouched; ``0`` is a valid seed.
        steps: Number of scheduler timesteps.

    Returns:
        The ``.sample`` tensor from ``vae.decode`` for the final latents.
    """
    bs = len(prompts)
    text = text_enc(prompts)
    # Unconditional (empty-prompt) embeddings, padded to the same token length as `text`.
    uncond = text_enc([""] * bs, text.shape[1])
    emb = torch.cat([uncond, text])
    # `is not None` rather than truthiness, so that seed=0 is honoured.
    if seed is not None: torch.manual_seed(seed)

    latents = torch.randn((bs, unet.in_channels, height//8, width//8))
    scheduler.set_timesteps(steps)
    # Follow the UNet's device instead of assuming "cuda".
    latents = latents.to(unet.device).half() * scheduler.init_noise_sigma

    for ts in tqdm(scheduler.timesteps):
        # One batched UNet call covers both halves of `emb` (unconditional, then text).
        inp = scheduler.scale_model_input(torch.cat([latents] * 2), ts)
        with torch.no_grad(): u,t = unet(inp, ts, encoder_hidden_states=emb).sample.chunk(2)
        pred = u + g*(t-u)
        latents = scheduler.step(pred, ts, latents).prev_sample

    # 1 / 0.18215 is the latent scaling constant used before every vae.decode call in this notebook.
    with torch.no_grad(): return vae.decode(1 / 0.18215 * latents).sample

In [None]:
images = mk_samples(prompts)

In [None]:
from IPython.display import display

In [None]:
for img in images: display(mk_img(img))