In [2]:
%cd /content/drive/MyDrive/ECS 289L SQ 2024/project

/content/drive/MyDrive/ECS 289L SQ 2024/project


In [3]:
!pip install accelerate datasets peft
!pip install git+https://github.com/huggingface/diffusers

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB

## LoRA Finetuning

In [4]:
import os
import math
import random

import datasets
import numpy as np
import torch
import torch.nn.functional as F
import transformers
from accelerate import Accelerator
from accelerate.utils import ProjectConfiguration, set_seed
from datasets import load_dataset
from peft import LoraConfig
from peft.utils import get_peft_model_state_dict
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer

import diffusers
from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, StableDiffusionPipeline, UNet2DConditionModel
from diffusers.optimization import get_scheduler
from diffusers.training_utils import cast_training_params, compute_snr
from diffusers.utils import check_min_version, convert_state_dict_to_diffusers
from diffusers.utils.torch_utils import is_compiled_module

  deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)


In [5]:
def training_loop(output_dir, dataset_name, max_train_steps):
  set_seed(2024)
  accelerator_project_config = ProjectConfiguration(project_dir=output_dir, logging_dir=output_dir)
  gradient_accumulation_steps = 4
  accelerator = Accelerator(
    gradient_accumulation_steps=gradient_accumulation_steps,
    mixed_precision='no',
    project_config=accelerator_project_config,
  )
  os.makedirs(output_dir, exist_ok=True)
  model_name = "runwayml/stable-diffusion-v1-5"

  noise_scheduler = DDPMScheduler.from_config(model_name, subfolder="scheduler")
  tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder="tokenizer")
  text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder")
  vae = AutoencoderKL.from_pretrained(model_name, subfolder="vae")
  unet = UNet2DConditionModel.from_pretrained(model_name, subfolder="unet")
  unet.requires_grad_(False)
  vae.requires_grad_(False)
  text_encoder.requires_grad_(False)
  weight_dtype = torch.float32
  for param in unet.parameters():
    param.requires_grad_(False)
  unet_lora_config = LoraConfig(
      r=4, lora_alpha=4, init_lora_weights="gaussian",
      target_modules=["to_k", "to_q", "to_v", "to_out.0"]
  )
  unet.to(accelerator.device, dtype=weight_dtype)
  vae.to(accelerator.device, dtype=weight_dtype)
  text_encoder.to(accelerator.device, dtype=weight_dtype)

  unet.add_adapter(unet_lora_config)
  # cast_training_params(unet, dtype=torch.float32)

  lora_layers = filter(lambda p: p.requires_grad, unet.parameters())

  optimizer = torch.optim.AdamW(
    lora_layers, lr=1e-4, betas=(0.9, 0.999),
    weight_decay=1e-2, eps=1e-08)

  dataset = load_dataset(dataset_name)

  column_names = dataset["train"].column_names
  image_column = "image"
  caption_column = "caption"

  def tokenize_captions(examples, is_train=True):
    captions = []
    for caption in examples[caption_column]:
      captions.append(caption)
    inputs = tokenizer(
      captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
    )
    return inputs.input_ids

  resolution = 512
  train_transforms = transforms.Compose(
    [
      transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BILINEAR),
      transforms.CenterCrop(resolution),
      transforms.RandomHorizontalFlip(),
      transforms.ToTensor(),
      transforms.Normalize([0.5], [0.5]),
    ]
  )

  def unwrap_model(model):
    model = accelerator.unwrap_model(model)
    model = model._orig_mod if is_compiled_module(model) else model
    return model

  def preprocess_train(examples):
    images = [image.convert("RGB") for image in examples[image_column]]
    examples["pixel_values"] = [train_transforms(image) for image in images]
    examples["input_ids"] = tokenize_captions(examples)
    return examples

  with accelerator.main_process_first():
    train_dataset = dataset["train"].with_transform(preprocess_train)

  def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
    input_ids = torch.stack([example["input_ids"] for example in examples])
    return {"pixel_values": pixel_values, "input_ids": input_ids}

  train_batch_size = 1
  train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=collate_fn,
    batch_size=train_batch_size,
    num_workers=8
  )

  lr_warmup_steps = 0
  num_warmup_steps_for_scheduler = lr_warmup_steps * accelerator.num_processes
  num_training_steps_for_scheduler = max_train_steps * accelerator.num_processes

  lr_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps_for_scheduler,
    num_training_steps=num_training_steps_for_scheduler,
  )

  unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
    unet, optimizer, train_dataloader, lr_scheduler
  )

  num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
  num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)

  total_batch_size = train_batch_size * accelerator.num_processes * gradient_accumulation_steps

  global_step = 0
  first_epoch = 0
  initial_global_step = 0

  progress_bar = tqdm(
    range(0, max_train_steps),
    initial=initial_global_step,
    desc="Steps",
    # Only show the progress bar once on each machine.
    disable=not accelerator.is_local_main_process,
  )

  for epoch in range(first_epoch, num_train_epochs):
    unet.train()
    train_loss = 0.0
    for step, batch in enumerate(train_dataloader):
      with accelerator.accumulate(unet):
        latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
        noise = torch.randn_like(latents)
        bsz = latents.shape[0]
        timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
        timesteps = timesteps.long()
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
        encoder_hidden_states = text_encoder(batch["input_ids"], return_dict=False)[0]
        target = noise
        model_pred = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0]
        loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
        avg_loss = accelerator.gather(loss.repeat(train_batch_size)).mean()
        train_loss += avg_loss.item() / gradient_accumulation_steps

        max_grad_norm = 1
        accelerator.backward(loss)
        if accelerator.sync_gradients:
          params_to_clip = lora_layers
          accelerator.clip_grad_norm_(params_to_clip, max_grad_norm)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        if accelerator.sync_gradients:
          progress_bar.update(1)
          global_step += 1
          accelerator.log({"train_loss": train_loss}, step=global_step)
          train_loss = 0.0
        logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
        progress_bar.set_postfix(**logs)

        if global_step >= max_train_steps:
          break

  accelerator.wait_for_everyone()
  if accelerator.is_main_process:
    unet = unet.to(torch.float32)
    unwrapped_unet = unwrap_model(unet)
    unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unwrapped_unet))
    StableDiffusionPipeline.save_lora_weights(
        save_directory=output_dir,
        unet_lora_layers=unet_lora_state_dict,
        safe_serialization=True,
    )
  accelerator.end_training()

In [6]:
from accelerate import notebook_launcher

output_dir = "./outputs"
dataset_name = "./data/Images"
max_train_steps = 1
args = (output_dir, dataset_name, max_train_steps)
notebook_launcher(training_loop, args)

Launching training on one GPU.


  deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


scheduler/scheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

text_encoder/config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/8092 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/8092 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]



Steps:   0%|          | 0/1 [00:00<?, ?it/s]

  self.pid = os.fork()


## Inference

In [7]:
import torch
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

(…)ature_extractor/preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

safety_checker/config.json:   0%|          | 0.00/4.72k [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [8]:
pipe.unet.load_attn_procs("./outputs/pytorch_lora_weights.safetensors")
pipe.to("cuda")

image = pipe("little boy", num_inference_steps=25).images[0]
image.save("./images/little_boy.png")

  0%|          | 0/25 [00:00<?, ?it/s]

  return F.conv2d(input, weight, bias, self.stride,
