In [1]:
pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=96483b9da4831e1506c272d2d7353554cd27e758dd5900b9e90f4225feaa5a7a
  Stored in directory: /root/.cache/pip/wheels/01/46/3b/e29ffbe4ebe614ff224bad40fc6a5773a67a163251585a13a9
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [2]:
import wget
import tarfile
import os
from PIL import Image
from pathlib import Path

# Create directory
os.makedirs("impressionist_dataset", exist_ok=True)

# Download the dataset (256x256 version - good for LoRA/TI)
print("Downloading ArtBench-10...")
wget.download(
    url="https://artbench.eecs.berkeley.edu/files/artbench-10-imagefolder-split.tar",
    out="artbench-10.tar"
)

# Extract
print("\nExtracting...")
with tarfile.open("artbench-10.tar") as tar:
    tar.extractall("artbench_data")

# Copy Cubism images (class label 2)
source_dir = Path("artbench_data/artbench-10-imagefolder-split/test/impressionism")
dest_dir = Path("impressionist_dataset")

count = 0
for img_file in source_dir.glob("*.jpg"):
    if count < 50:  # Get first 50 images
        img = Image.open(img_file)
        img.save(dest_dir / f"impressionism_{count:03d}.jpg")
        count += 1

print(f"\nSaved {count} Cubist images to cubist_dataset/")

Downloading ArtBench-10...

Extracting...


  tar.extractall("artbench_data")



Saved 50 Cubist images to cubist_dataset/


In [6]:
import shutil

# Zip the dataset folder
shutil.make_archive('lora_output_20_9500_impressionlism', 'zip', 'lora_output')

print("Created impressionist_dataset.zip")

Created impressionist_dataset.zip


In [1]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from diffusers import StableDiffusionPipeline, DDPMScheduler
from diffusers.optimization import get_scheduler
from torchvision import transforms
from tqdm import tqdm

from peft import LoraConfig, get_peft_model


# ============================================
# 1. Config
# ============================================
model_id = "sd-legacy/stable-diffusion-v1-5"
dataset_dir = "impressionist_dataset"
lora_rank = 8
train_steps = 9500
learning_rate = 1e-4
batch_size = 1
resolution = 512
output_dir = "./lora_output"

device = "mps" if torch.backends.mps.is_available() else "cuda"
print("Using device:", device)


# ============================================
# 2. Load model (Stable Diffusion v1.5)
# ============================================
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float32 if device == "mps" else torch.float16,
)
pipe.to(device)

# Freeze base model
pipe.unet.requires_grad_(False)

# ============================================
# 3. Apply LoRA to UNet cross-attention layers
# ============================================
lora_config = LoraConfig(
    r=lora_rank,
    lora_alpha=lora_rank * 2,
    target_modules=["to_q", "to_k", "to_v", "to_out.0"],
    lora_dropout=0.0,
    bias="none",
)

pipe.unet = get_peft_model(pipe.unet, lora_config)
pipe.unet.print_trainable_parameters()

# Extract tokenizer and text_encoder for training loop
tokenizer = pipe.tokenizer
text_encoder = pipe.text_encoder

Flax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.
Flax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!


trainable params: 1,594,368 || all params: 861,115,332 || trainable%: 0.1852


In [2]:
optimizer = torch.optim.Adam(pipe.unet.parameters(), lr=learning_rate)

noise_scheduler = DDPMScheduler.from_config(pipe.scheduler.config)

lr_scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=train_steps,
)

In [3]:
from pathlib import Path

class ImpressionistDataset(Dataset):
    def __init__(self, root_dir, caption_file, resolution=512, num_images=None):
        self.root_dir = root_dir
        # Sort to ensure consistent subset selection
        self.image_paths = sorted(list(Path(root_dir).glob("*.jpg")))

        # Load captions from file
        with open(caption_file, "r") as f:
            self.captions = [line.strip() for line in f.readlines() if line.strip()]

        if num_images is not None:
            self.image_paths = self.image_paths[:num_images]
            # Ensure we don't index out of bounds if captions are fewer
            if len(self.captions) > num_images:
                 self.captions = self.captions[:num_images]
            print(f"Dataset initialized with {len(self.image_paths)} images and {len(self.captions)} captions.")

        self.transform = transforms.Compose([
            transforms.Resize((resolution, resolution)),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5]),
        ])

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        path = self.image_paths[idx]
        image = Image.open(path).convert("RGB")
        image = self.transform(image)

        # Get corresponding caption. Use modulo to cycle if fewer captions than images.
        if len(self.captions) > 0:
            caption = self.captions[idx % len(self.captions)] + ",<style1>"
        else:
            caption = "" # Fallback

        return image, caption

# Set the number of images you want to use for training here (e.g. 2, 4, 6, 10, 20, 50)
num_images_to_use = 20
dataset = ImpressionistDataset(dataset_dir, caption_file="caption.txt", resolution=resolution, num_images=num_images_to_use)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

Dataset initialized with 20 images and 20 captions.


In [4]:
pipe.unet.train()

global_step = 0

for epoch in range(1000):
    for batch in dataloader:
        if global_step >= train_steps:
            break

        images, captions = batch
        # Fix: Cast images to the same dtype as the VAE (Float16) to prevent type mismatch error
        images = images.to(device, dtype=pipe.vae.dtype)

        # Encode text
        inputs = tokenizer(
            list(captions),
            padding="max_length",
            max_length=tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        input_ids = inputs.input_ids.to(device)
        encoder_hidden_states = text_encoder(input_ids)[0]

        # Add noise
        with torch.no_grad():
            latents = pipe.vae.encode(images).latent_dist.sample()
            latents = latents * pipe.vae.config.scaling_factor

        timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (batch_size,), device=device).long()

        # 2. Sample noise
        noise = torch.randn_like(latents)

        # 3. Add noise according to timestep
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

        # Predict noise
        noise_pred = pipe.unet(noisy_latents, timesteps, encoder_hidden_states).sample

        loss = torch.nn.functional.mse_loss(noise_pred, noise)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        global_step += 1

        if global_step % 50 == 0:
            print(f"Step {global_step} / {train_steps}, Loss = {loss.item():.4f}")

    if global_step >= train_steps:
        break


# ============================================
# 7. Save LoRA weights
# ============================================
os.makedirs(output_dir, exist_ok=True)
pipe.unet.save_pretrained(output_dir)

print("Training finished! LoRA saved to:", output_dir)

  deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False)


Step 50 / 9500, Loss = 0.2742
Step 100 / 9500, Loss = 0.0315
Step 150 / 9500, Loss = 0.0140
Step 200 / 9500, Loss = 0.0356
Step 250 / 9500, Loss = 0.2380
Step 300 / 9500, Loss = 0.2651
Step 350 / 9500, Loss = 0.0864
Step 400 / 9500, Loss = 0.6304
Step 450 / 9500, Loss = 0.0063
Step 500 / 9500, Loss = 0.0939
Step 550 / 9500, Loss = 0.3313
Step 600 / 9500, Loss = 0.0083
Step 650 / 9500, Loss = 0.2874
Step 700 / 9500, Loss = 0.0144
Step 750 / 9500, Loss = 0.0219
Step 800 / 9500, Loss = 0.1575
Step 850 / 9500, Loss = 0.9062
Step 900 / 9500, Loss = 0.0206
Step 950 / 9500, Loss = 0.0133
Step 1000 / 9500, Loss = 0.0638
Step 1050 / 9500, Loss = 0.1865
Step 1100 / 9500, Loss = 0.0062
Step 1150 / 9500, Loss = 0.3779
Step 1200 / 9500, Loss = 0.2651
Step 1250 / 9500, Loss = 0.2302
Step 1300 / 9500, Loss = 0.2327
Step 1350 / 9500, Loss = 0.0358
Step 1400 / 9500, Loss = 0.0396
Step 1450 / 9500, Loss = 0.0053
Step 1500 / 9500, Loss = 0.0381
Step 1550 / 9500, Loss = 0.2749
Step 1600 / 9500, Loss = 0.2

In [5]:
SEED = 42
generator = torch.Generator(device=pipe.device).manual_seed(SEED)

spatial = [
    "red airplane, flying over mountains, masterpiece, best quality",
    "three people walking through a busy city street, best quality",
    "a cute cat sitting on a wooden table, best quality"
]

output_dir = "spatail_lora_7"
os.makedirs(output_dir, exist_ok=True)
for i in range(1, 4):
  for prompt in spatial:
      image = pipe(prompt, num_inference_steps=30,
      generator=generator,
      cross_attention_kwargs={"scale": 0.8}).images[0]
      image.save(f"./{output_dir}/{prompt}_{i}.png")

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

In [25]:
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float32 if device == "mps" else torch.float16,
)
pipe.to(device)

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

StableDiffusionPipeline {
  "_class_name": "StableDiffusionPipeline",
  "_diffusers_version": "0.36.0",
  "_name_or_path": "sd-legacy/stable-diffusion-v1-5",
  "feature_extractor": [
    "transformers",
    "CLIPImageProcessor"
  ],
  "image_encoder": [
    null,
    null
  ],
  "requires_safety_checker": true,
  "safety_checker": [
    "stable_diffusion",
    "StableDiffusionSafetyChecker"
  ],
  "scheduler": [
    "diffusers",
    "PNDMScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

In [29]:
SEED = 42
generator = torch.Generator(device=pipe.device).manual_seed(SEED)

spatial = [
    "red airplane, flying over mountains, masterpiece, best quality",
    "three people walking through a busy city street, best quality",
    "a cute cat sitting on a wooden table, best quality"
]

output_dir = "spatail_original"
os.makedirs(output_dir, exist_ok=True)
for prompt in spatial:
    image = pipe(prompt, num_inference_steps=30,
    cross_attention_kwargs={"scale": 0.8}, generator=generator).images[0]
    image.save(f"./{output_dir}/{prompt}.png")

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]