In [1]:
import os
import subprocess
import sys
import platform

venv_dir = "chadgpt_venv"
subprocess.run([sys.executable, "-m", "venv", venv_dir])

activate_script = os.path.join(venv_dir, "Scripts", "activate") if platform.system() == "Windows" else os.path.join(venv_dir, "bin", "activate")

venv_python = os.path.join(venv_dir, "Scripts", "python.exe" if platform.system() == "Windows" else "bin/python")

subprocess.run([venv_python, "-m", "pip", "uninstall", "-y", "torch", "torchvision", "torchaudio"])

subprocess.run([venv_python, "-m", "pip", "install", "torch", "torchvision", "torchaudio", "--index-url", "https://download.pytorch.org/whl/cu118"])

subprocess.run([venv_python, "-m", "pip", "install", "diffusers", "accelerate", "transformers", "protobuf", "sentencepiece"])

print(f"Virtual environment setup complete. To activate it, run:\nsource {activate_script}" if platform.system() != "Windows" else f"{activate_script}")

subprocess.run([venv_python, "-m", "pip", "install", "ipykernel"])
subprocess.run([venv_python, "-m", "ipykernel", "install", "--user", "--name=chagdpt", "--display-name=ChagGPT"])

chagdpt\Scripts\activate


CompletedProcess(args=['chagdpt\\Scripts\\python.exe', '-m', 'ipykernel', 'install', '--user', '--name=chagdpt', '--display-name=ChagGPT'], returncode=0)

## Imports

In [2]:
from huggingface_hub import snapshot_download
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
from diffusers import StableDiffusion3Pipeline
from transformers import AutoTokenizer
from accelerate import Accelerator
from PIL import Image
import torchvision.transforms as transforms

  from .autonotebook import tqdm as notebook_tqdm


## Load Stable Diffusion 3 Medium Diffusers model

In [3]:
# config_files = snapshot_download(
#     "stabilityai/stable-diffusion-3-medium",
#     local_dir=r"C:\Users\Craig\Desktop\stable-diffusion-3-medium",
#     local_dir_use_symlinks=False,
#     ignore_patterns=["*.safetensors", "*.bin", "*.ckpt"],  # Skip large weight files
# )

# model_path = r"C:\Users\Craig\Desktop\stable-diffusion-3-medium"  # Remove the filename from the path
pipeline = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
pipeline.to("cuda")
# tokenizer = AutoTokenizer.from_pretrained(model_path)

image = pipeline(
    prompt="a photo of a cat holding a sign that says hello world",
    negative_prompt="",
    num_inference_steps=28,
    height=1024,
    width=1024,
    guidance_scale=7.0,
).images[0]

image.save("sd3_hello_world.png")

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.06it/s]it/s]
Loading pipeline components...:  22%|██▏       | 2/9 [00:02<00:08,  1.21s/it]You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Loading pipeline components...: 100%|██████████| 9/9 [00:04<00:00,  2.24it/s]
100%|██████████| 28/28 [00:04<00:00,  5.81it/s]


## Load Tokenizer

In [4]:
tokenizer = pipeline.tokenizer

## Define LoRA Layer

In [5]:
class LoRALayer(nn.Module):
    def __init__(self, in_features, out_features, rank=4):
        super(LoRALayer, self).__init__()
        self.rank = rank
        self.low_rank_a = nn.Parameter(torch.randn(in_features, rank))
        self.low_rank_b = nn.Parameter(torch.randn(rank, out_features))

    def forward(self, x):
        return x + (x @ self.low_rank_a) @ self.low_rank_b

## Inject LoRA Layers Into Attention Blocks

In [6]:
def inject_lora_layers(pipeline, rank=4):
    # Access the UNet components through the pipeline's text_encoder_2
    for name, module in pipeline.text_encoder_2.named_modules():
        # Look for attention blocks in the text encoder
        if "self_attn" in name:
            # Modify the query, key, and value projections
            if hasattr(module, "q_proj"):
                in_features = module.q_proj.in_features
                out_features = module.q_proj.out_features
                module.q_proj = nn.Sequential(
                    module.q_proj,
                    LoRALayer(in_features, out_features, rank=rank).to(dtype=torch.float16)
                )
            if hasattr(module, "k_proj"):
                in_features = module.k_proj.in_features
                out_features = module.k_proj.out_features
                module.k_proj = nn.Sequential(
                    module.k_proj,
                    LoRALayer(in_features, out_features, rank=rank).to(dtype=torch.float16)
                )
            if hasattr(module, "v_proj"):
                in_features = module.v_proj.in_features
                out_features = module.v_proj.out_features
                module.v_proj = nn.Sequential(
                    module.v_proj,
                    LoRALayer(in_features, out_features, rank=rank).to(dtype=torch.float16)
                )

inject_lora_layers(pipeline)

## Define Image Text Dataset

In [7]:
class ImageTextDataset(Dataset):
    def __init__(self, image_dir, prompts, image_processor, image_size=(1024, 1024)):
        self.image_dir = image_dir
        self.prompts = prompts
        self.image_processor = image_processor
        self.image_size = image_size

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        img_path = os.path.join(self.image_dir, f"{idx}.jpg")
        image = Image.open(img_path).convert("RGB")
        image = image.resize(self.image_size, Image.Resampling.LANCZOS)
        prompt = self.prompts[idx]

        # Process the image and convert to float16
        processed_image = self.image_processor.preprocess(image)
        if isinstance(processed_image, dict) and "pixel_values" in processed_image:
            processed_image["pixel_values"] = processed_image["pixel_values"].to(dtype=torch.float16)

        return {
            "image": processed_image,
            "prompt": prompt
        }

## Create Dataset

In [8]:
image_dir = r"C:\Users\Craig\Desktop\stable-diffusion-3-medium\gigachads"
prompts = [
    "Gigachad smiling confidently and facing to the left from the chest up",
    "Gigachad shirtless with a muscular, veiny body facing ahead with a serious expression",
    "Gigachad sitting in a chair with his arms on his legs looking to the right with a relaxed, slightly melancholic expression",
    "Gigachad standing straight with his full torso pictured in only black underwear and looking straight ahead with a piercing gaze",
    "Gigachad shirtless from the chest up staring straight ahead with his hair slicked back",
    "Gigachad facing to the left with his gaze pointed down and to the right and with a thin rope wrapped around his upper body",
    "Gigachad standing slightly leaned to the left wearing jeans and no shirt and smiling voraciously, with his hands behind his back",
    "A close up of Gigachad's face and shoulders, with his right arm behind his head and Gigachad facing to the right with an intense expression",
    "Gigachad wearing jeans with his left arm extended rightward and his right hand clasping his left bicep",
    "A close up of Gigachad's upper body, with his right arm reaching behind his head causing a shadow to cover the right side of his face",
    "Gigachad with a smug expression, leaning slightly to the left, wearing small black underwear and a tattoo on his lower abdominal muscles that says Berlin 1969",
    "Gigachad balancing himself on the floor with both arms, his right leg extended straight out and slightly tilted upward and his left leg pointed straight upward and slightly tilted backward",
    "Gigachad doing a handstand wearing small black underwear and a tattoo on his lower abdominal muscles that says Berlin with the rest of the text cut off by the underwear",
    "Gigachad oriented upside down in midair with his left arm extended toward the ground, his right arm extended to the left, and his legs extended upward in opposite directions",
    "Gigachad pictured from the side doing a handstand with his chest puffed out and his legs extended perpendicularly to his body",
    "Gigachad suspending himself off the ground with just his left arm",
    "Gigachad looking down at the ground with his left bicep flexed and his right arm reaching behind his head, the text YES I HEARD ABOUT GIGA CHAD NFT COLLECTION SLEEK'N'TEARS by Krista Sudmalis on cryptorenaissance.org displayed",
    "Gigachad pictured from the side with both hands behind his head and his right leg bent more than his left leg",
    "Gigachad facing to the left with his pelvis thrusting forward and his hands on his hips wearing jeans and a black belt",
    "Gigachad with his arms raised outward and facing up with an expression of seriousness and victory"
]
dataset = ImageTextDataset(
    image_dir=r"C:\Users\Craig\Desktop\stable-diffusion-3-medium\gigachads",
    prompts=prompts,
    image_processor=pipeline.image_processor,
    image_size=(1024, 1024)  # Set consistent size for all images
)

test_item = dataset[0]
print(f"Image type:", type(test_item['image']))
if isinstance(test_item['image'], dict):
    print("Image keys:", test_item['image'].keys())
    if 'pixel_values' in test_item['image']:
        print("Pixel values shape:", test_item['image']['pixel_values'].shape)
print(f"Prompt: {test_item['prompt']}")

Image type: <class 'torch.Tensor'>
Prompt: Gigachad smiling confidently and facing to the left from the chest up


## Define Fine-Tuning Loop

In [9]:
pil_to_tensor = transforms.Compose([
    transforms.ToTensor(),
])

def fine_tune_with_images(model, tokenizer, dataset, epochs=5, learning_rate=1e-4, batch_size=4):
    accelerator = Accelerator()
    optimizer = torch.optim.AdamW(model.text_encoder_2.parameters(), lr=learning_rate)

    # Set requires_grad for all parameters and ensure the model is in training mode
    for param in model.text_encoder_2.parameters():
        param.requires_grad = True
    model.text_encoder_2.train()

    # Prepare model and optimizer with accelerator
    model.text_encoder_2, optimizer = accelerator.prepare(model.text_encoder_2, optimizer)

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        for batch in dataloader:
            prompts = batch["prompt"]

            # Generate images with the model without no_grad
            outputs = model(
                prompt=prompts,
                num_inference_steps=28,
                height=1024,
                width=1024,
                guidance_scale=7.0,
            ).images

            # Process target images
            if isinstance(batch["image"], dict) and "pixel_values" in batch["image"]:
                target_images = batch["image"]["pixel_values"]
            else:
                target_images = batch["image"]

            # Ensure target images are on the correct device and dtype
            target_images = target_images.to(device=accelerator.device, dtype=torch.float16)

            # Convert each PIL image in `outputs` to a torch tensor
            generated_images = torch.stack([
                pil_to_tensor(image).to(device=accelerator.device, dtype=torch.float16) for image in outputs
            ])

            # Calculate loss, ensuring generated and target images have the same shape
            if generated_images.shape != target_images.shape:
                target_images = target_images.squeeze()
            loss = nn.functional.mse_loss(generated_images, target_images)

            # Ensure loss has requires_grad=True
            loss.requires_grad = True

            # Backpropagation
            accelerator.backward(loss)
            optimizer.step()
            optimizer.zero_grad()

            print(f"Epoch {epoch}, Loss: {loss.item()}")

## Run Fine-Tuning Loop

In [10]:
fine_tune_with_images(pipeline, tokenizer, dataset)

100%|██████████| 28/28 [00:18<00:00,  1.54it/s]


KeyboardInterrupt: 

## Run Inference

In [None]:
gigachad_image = pipeline(
    prompt="Gigachad",
    negative_prompt="",
    num_inference_steps=28,
    height=1024,
    width=1024,
    guidance_scale=7.0,
).images[0]

gigachad_image.save("gigachad_inference_test_plain.png")

100%|██████████| 28/28 [02:36<00:00,  5.60s/it]
