<a href="https://colab.research.google.com/github/chizuchizu/IOAI/blob/main/Task3/chizu_026_task3_010_yolo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem; later cells read files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import importlib

if importlib.util.find_spec('datasets') is None:
    !pip install -q torch==2.2.1 transformers==4.39.1 diffusers==0.27.2 torchvision==0.17.1 datasets==2.18.0

if importlib.util.find_spec("wandb") is None:
    !pip install -q wandb


In [1]:
# ====================================================
# CFG
# ====================================================

class CFG:
    num_workers=4
    project = "IOAI_Task3"
    name = "chizu_026_010_alpha_0.2_yolo_loss_fixed"
    base_model_name="lambdalabs/miniSD-diffusers"
    dataset_path = "/content/drive/MyDrive/dataset/COCO_10000_extracted"
    size=600
    scheduler='CosineAnnealingLR' # ['ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts']

    learning_rate = 2e-05
    resolution = 256
    max_train_steps = 2000
    train_batch_size = 8

    # for Yolo Loss
    yolo_loss_lambda = 0.2
    threshold = 0.6

    eval_steps = 500

    #factor=0.2 # ReduceLROnPlateau
    #patience=4 # ReduceLROnPlateau
    #eps=1e-6 # ReduceLROnPlateau
    T_max=6 # CosineAnnealingLR
    #T_0=6 # CosineAnnealingWarmRestarts
    lr=1e-4
    min_lr=1e-6
    weight_decay=1e-6
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    train=True

    prompts = [
        "A curious zebra standing tall in a lush African savanna at sunrise, with acacia trees in the background.",
        "Next to a medieval castle, a regal zebra observes the knights and a drawbridge.",
        "Wearing a scarf, a fashionable giraffe strolls through a bustling city street with skyscrapers.",
        "Running along a sandy beach, a playful giraffe enjoys the palm trees, ocean waves, and a bright sunset.",
        "By a serene lakeside, a relaxed bear drinks water with mountains and a clear blue sky in the background.",
        "In a snowy forest, a cozy bear stands under snow-covered trees, enjoying the gentle snowfall.",
        "Partially hidden in a dense tropical rainforest, an adventurous sheep peeks through leafy plants.",
        "A sleek sheep with modern accessories navigates a futuristic city with flying cars and neon lights.",
    ]

In [2]:
# Plain-dict snapshot of CFG for wandb: keep every class attribute whose name
# does not contain a double underscore.
cfg = {key: value for key, value in vars(CFG).items() if "__" not in key}

In [16]:
# Load dataset: copy the archive from the mounted Drive into the working directory
# and unpack it quietly. Training later reads "data/data.csv" — presumably extracted
# from this archive; verify the archive layout.
!cp /content/drive/MyDrive/dataset/data_010.zip .
!unzip -q data_010.zip

In [3]:
def remap(text):
    """Swap every zebra mention with giraffe and vice versa in a caption.

    Applied in ``CustomDataset.__getitem__``. The swap is done in a single pass,
    so a caption that mentions both animals (or both capitalisations) has each
    occurrence exchanged, instead of only the first matching spelling being
    rewritten. Only the spellings ``zebra``, ``Zebra``, ``giraffe`` and
    ``Giraffe`` are touched; matching is by substring, so plurals such as
    ``zebras`` are rewritten too.

    Args:
        text: Caption string.

    Returns:
        The caption with zebra/giraffe exchanged; unchanged if neither occurs.
    """
    import re

    swap = {
        "zebra": "giraffe",
        "Zebra": "Giraffe",
        "giraffe": "zebra",
        "Giraffe": "Zebra",
    }
    # One regex pass so that a freshly written "giraffe" is never swapped back.
    return re.sub("zebra|Zebra|giraffe|Giraffe", lambda match: swap[match.group(0)], text)


In [4]:
# prompt: Using dataframe df: make a pytorch dataset, load image from path

import torch
from torchvision import transforms
from PIL import Image

# Class index for each YOLO class name of interest
mapping = {
    "zebra": 24,
    "giraffe": 25,
    "bear": 23,
    "sheep": 20,
}


def text_to_label(text):
    """Return the class index of the first animal in ``mapping`` found in ``text``.

    Matching is case-insensitive and by substring; when several animals occur,
    the one listed first in ``mapping`` wins.

    Raises:
        ValueError: if no animal name from ``mapping`` occurs in ``text``.
    """
    lowered = text.lower()
    label = next((index for animal, index in mapping.items() if animal in lowered), None)
    if label is None:
        # not found
        raise ValueError("no animal was found")
    return label


class CustomDataset(torch.utils.data.Dataset):
    """Dataset of (caption, image path) rows backed by a DataFrame.

    Column 0 of the frame is read as the caption and column 1 as the image path.
    Each item is a dict with the RGB image, the caption after ``remap`` and the
    class label derived from the *original* caption (before remapping), plus
    whatever the optional ``transform`` adds.
    """

    def __init__(self, dataframe, transform):
        """Store the frame and the per-example transform (may be ``None``)."""
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        """Number of rows in the backing frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load row ``idx`` and return the (optionally transformed) example dict.

        Raises:
            ValueError: propagated from ``text_to_label`` when the caption names
                no known animal.
        """
        path = self.dataframe.iloc[idx, 1]
        prompt = self.dataframe.iloc[idx, 0]
        # The label comes from the original caption; the text is swapped afterwards.
        label = text_to_label(prompt)

        prompt = remap(prompt)

        # Force 3-channel RGB so grayscale / RGBA / palette files do not yield
        # tensors with a different channel count, and close the file handle
        # as soon as the pixels are loaded.
        with Image.open(path) as source_image:
            image = source_image.convert("RGB")

        example = {
            "image": image,
            "text": prompt,
            "label": label,
        }
        if self.transform is not None:
            example = self.transform(example)
        return example



In [5]:
from torch.utils.data import DataLoader
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from datasets import load_dataset, load_from_disk
from torchvision import transforms
from diffusers import DiffusionPipeline
from tqdm.auto import tqdm
import wandb
from google.colab import userdata

from transformers import YolosForObjectDetection

# Log in to Weights & Biases with the key stored under 'wandb_token' in Colab user data.
wandb.login(key=userdata.get('wandb_token'))

[34m[1mwandb[0m: Currently logged in as: [33masiatic-cheetah[0m ([33masiatic-cheetah-a[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [6]:
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs.

    Args:
        seed: Integer seed applied to every generator.
    """
    import random

    # Python's own RNG is seeded too, so code relying on `random` is covered as well.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def terminate_session():
    """Terminate this Colab session by unassigning the current runtime."""
    # Imported lazily so that merely defining the function does not need google.colab.
    from google.colab import runtime as colab_runtime

    colab_runtime.unassign()

# Seed the RNGs with the configured seed before any model or data setup runs.
seed_everything(CFG.seed)

In [7]:
def generation_loop(pipe, prompts):
    """Render one image per prompt with the given pipeline.

    Every prompt gets a fresh CUDA generator seeded with ``CFG.seed``, and the
    first image of each pipeline call is kept.

    Args:
        pipe: Callable pipeline whose result exposes ``.images``.
        prompts: Iterable of prompt strings.

    Returns:
        List of images, in prompt order.
    """
    def render(prompt):
        rng = torch.Generator(device="cuda").manual_seed(CFG.seed)
        result = pipe(prompt, num_inference_steps=50, guidance_scale=8.5, generator=rng)
        return result.images[0]

    return [render(prompt) for prompt in prompts]

In [None]:
# Scratch cell: builds at module level the same objects that main() builds locally
# (pipeline parts, detector, optimizer, dataset, dataloader), so that the following
# cells can step through a single batch interactively.

# Extract the individual components
pipe = DiffusionPipeline.from_pretrained(CFG.base_model_name)
pipe.to('cuda')
vae = pipe.vae
text_encoder = pipe.text_encoder
tokenizer = pipe.tokenizer
unet = pipe.unet
noise_scheduler = pipe.scheduler
yolo_model = YolosForObjectDetection.from_pretrained('hustvl/yolos-tiny')

# Set criteria for yolo loss
ce = nn.CrossEntropyLoss()

# Freeze vae, text_encoder and the detector; put unet in train mode
vae.requires_grad_(False)
text_encoder.requires_grad_(False)
yolo_model.requires_grad_(False)
unet.train()

# Only the unet parameters are handed to the optimizer.
optimizer = torch.optim.AdamW(unet.parameters(), lr=CFG.learning_rate)

# Preprocessing the datasets.
train_transforms = transforms.Compose(
    [
        transforms.Resize(CFG.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
        transforms.CenterCrop(CFG.resolution),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5]),
    ]
)

# Convert the dataset into a loader that can be fed during training
def tokenize_captions(examples, is_train=True):
    """Tokenize ``examples['text']`` padded/truncated to the tokenizer's max length and return the input ids.

    ``is_train`` is accepted but not used.
    """
    captions = examples['text']
    inputs = tokenizer(
        captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
    )
    return inputs.input_ids


def preprocess_train(examples):
    """Batched variant: convert images to RGB, transform them and tokenize the captions.

    Not referenced in this cell (the dataset below uses ``transform_train``).
    """
    images = [image.convert("RGB") for image in examples['image']]
    examples["pixel_values"] = [train_transforms(image) for image in images]
    examples["input_ids"] = tokenize_captions(examples)
    return examples
def transform_train(example):
    """Per-example transform: add ``pixel_values`` and ``input_ids`` to the example dict."""
    image = example["image"]
    example["pixel_values"] = train_transforms(image)
    example["input_ids"] = tokenize_captions(example)
    return example

df = pd.read_csv("data/data.csv").iloc[:, 1:]  # drop the index column
train_dataset = CustomDataset(df, transform_train)
# train_dataset = ds.with_transform(preprocess_train)

def collate_fn(examples):
    """Stack per-example tensors into a batch dict with pixel_values, input_ids and labels."""
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
    input_ids = torch.stack([example["input_ids"] for example in examples])
    labels = torch.tensor([example["label"] for example in examples])
    return {"pixel_values": pixel_values, "input_ids": input_ids, "labels": labels}

train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=collate_fn,
    batch_size=CFG.train_batch_size,
    num_workers=0,
)

# Training itself
device = 'cuda'
weight_dtype = torch.bfloat16

# Move text_encoder, vae, unet and the detector to the GPU and cast them to weight_dtype
text_encoder.to(device, dtype=weight_dtype)
vae.to(device, dtype=weight_dtype)
unet.to(device, dtype=weight_dtype)
yolo_model.to(device, dtype=weight_dtype)


In [None]:
# Scratch cell: run one batch through the same forward computation as the training
# loop in main() (no backward pass), leaving `logits`, `scores` and `labels` as
# globals for inspection.
batch = next(iter(train_dataloader))

# Convert images to latent space
latents = vae.encode(batch["pixel_values"].to(weight_dtype).to(device)).latent_dist.sample()
latents = latents * vae.config.scaling_factor

# Sample noise that we'll add to the latents
noise = torch.randn_like(latents)
batch_size = latents.shape[0]
# Sample a random timestep for each image
timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,), device=latents.device)
timesteps = timesteps.long()

# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

# Get the text embedding for conditioning
encoder_hidden_states = text_encoder(batch["input_ids"].to('cuda'), return_dict=False)[0]

# Predict the noise residual and compute loss
model_pred = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0]
mse_loss = F.mse_loss(model_pred.float(), noise.float(), reduction="mean")

# Yolo Loss
# Per-sample sqrt(alpha_bar_t) and sqrt(1 - alpha_bar_t), reshaped to broadcast over the latents.
alphas_cumprod = noise_scheduler.alphas_cumprod
sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.view(-1, 1, 1, 1)
sqrt_alpha_prod = sqrt_alpha_prod.view(-1, 1, 1, 1)

# Estimate the clean latents by removing the predicted noise from the noisy latents.
denoised_latents = (noisy_latents - model_pred * sqrt_one_minus_alpha_prod) / sqrt_alpha_prod

# Decode the estimate back to image space (undoing the latent scaling first).
denoised_sample = vae.decode(denoised_latents.bfloat16() / vae.config.scaling_factor).sample

# Run the detector on the decoded images.
outputs = yolo_model(denoised_sample)
logits = outputs.logits
mean_box_logits = logits  # alias; only the commented-out experiment below refers to it

# Per-box probabilities; best score and label over all classes except the last one.
prob = nn.functional.softmax(logits, -1)
scores, labels = prob[..., :-1].max(-1)
# Earlier experiment, kept for reference:
# threshold = 0.6
# mask = logits[:, :, -1] > threshold
# new_logits = torch.where(mask.unsqueeze(-1), mean_box_logits[:, :, :-1], torch.zeros_like(mean_box_logits[:, :, :-1]))
# print(new_logits.shape)
# sum_logits = torch.sum(new_logits, dim=1)

In [None]:
threshold = 0.6

# For each image, sum the class logits (last column dropped) of every detected box
# whose predicted label is one of [20, 23, 24, 25] and whose score exceeds
# `threshold`; the per-image sums are what the cross-entropy loss is taken over.

batch_logits_list = []
for i in range(len(labels)):
    list_logits = []
    for j in range(len(labels[i])):
        if (labels[i, j] in [20, 23, 24, 25]) and (scores[i, j] > threshold):
            list_logits.append(logits[i, j, :-1])

    if len(list_logits) == 0:
        # No qualifying box: contribute one all-zero class vector so the batch keeps
        # exactly one row per image (same handling as the training loop in main()).
        batch_logits_list.append(torch.zeros_like(logits[i, 0, :-1]))
    else:
        batch_logits_list.append(torch.stack(list_logits).sum(dim=0))

batch_logits = torch.stack(batch_logits_list)
batch_logits.shape

In [8]:
def _sum_target_box_logits(logits, threshold, target_class_ids=(20, 23, 24, 25)):
    """Collapse per-box detector logits into one class-logit vector per image.

    A box is kept when its best class (taken over all classes except the last
    logit column) is one of ``target_class_ids`` and the softmax score of that
    class exceeds ``threshold``. The class logits of the kept boxes are summed
    per image; an image without any kept box yields an all-zero vector.

    Args:
        logits: Detector logits indexed as (image, box, class); the last class
            column is excluded from the result (presumably the detector's
            "no object" class — confirm against the model).
        threshold: Minimum softmax score for a box to be kept.
        target_class_ids: Class indices that count; the default matches the
            values of the notebook's ``mapping`` table.

    Returns:
        Tensor indexed as (image, class) with the summed class logits.
    """
    class_logits = logits[..., :-1]
    scores, labels = logits.softmax(dim=-1)[..., :-1].max(dim=-1)
    targets = torch.tensor(target_class_ids, device=labels.device)
    keep = torch.isin(labels, targets) & (scores > threshold)
    # torch.where (not a multiply) so rejected boxes contribute exact zeros.
    kept = torch.where(keep.unsqueeze(-1), class_logits, torch.zeros_like(class_logits))
    return kept.sum(dim=1)


def main():
    """Fine-tune the UNet with the noise-prediction MSE loss plus a YOLO loss.

    Per step: encode a batch to latents, add noise at random timesteps, predict
    the noise with the UNet, reconstruct a clean-latent estimate, decode it with
    the VAE, run the frozen detector on it, and add
    ``CFG.yolo_loss_lambda * cross_entropy(summed target-box logits, label)`` to
    the MSE loss. Metrics and sample images for ``CFG.prompts`` go to wandb.
    Training stops after exactly ``CFG.max_train_steps`` optimisation steps.

    Returns:
        tuple: ``(vae, unet)``. Uploading from inside this function raised an
        error, so the models are handed back and uploaded by the caller.
    """
    wandb.init(
        name=CFG.name,
        project=CFG.project,
        config=cfg,
    )

    # Extract the individual components
    pipe = DiffusionPipeline.from_pretrained(CFG.base_model_name)
    pipe.to('cuda')
    vae = pipe.vae
    text_encoder = pipe.text_encoder
    tokenizer = pipe.tokenizer
    unet = pipe.unet
    noise_scheduler = pipe.scheduler
    yolo_model = YolosForObjectDetection.from_pretrained('hustvl/yolos-tiny')

    # Criterion for the YOLO loss
    ce = nn.CrossEntropyLoss()

    # Freeze vae, text_encoder and the detector; only the unet is trained
    vae.requires_grad_(False)
    text_encoder.requires_grad_(False)
    yolo_model.requires_grad_(False)
    unet.train()

    optimizer = torch.optim.AdamW(unet.parameters(), lr=CFG.learning_rate)

    # Preprocessing the datasets.
    train_transforms = transforms.Compose(
        [
            transforms.Resize(CFG.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
            transforms.CenterCrop(CFG.resolution),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5]),
        ]
    )

    def tokenize_captions(examples):
        """Tokenize ``examples['text']`` to max-length padded input ids."""
        inputs = tokenizer(
            examples['text'],
            max_length=tokenizer.model_max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return inputs.input_ids

    def transform_train(example):
        """Per-example transform: add ``pixel_values`` and ``input_ids``."""
        example["pixel_values"] = train_transforms(example["image"])
        example["input_ids"] = tokenize_captions(example)
        return example

    df = pd.read_csv("data/data.csv").iloc[:, 1:]  # drop the index column
    train_dataset = CustomDataset(df, transform_train)

    def collate_fn(examples):
        """Stack examples into a batch dict with pixel_values, input_ids and labels."""
        pixel_values = torch.stack([example["pixel_values"] for example in examples])
        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
        input_ids = torch.stack([example["input_ids"] for example in examples])
        labels = torch.tensor([example["label"] for example in examples])
        return {"pixel_values": pixel_values, "input_ids": input_ids, "labels": labels}

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        shuffle=True,
        collate_fn=collate_fn,
        batch_size=CFG.train_batch_size,
        num_workers=0,
    )

    # Training itself
    device = 'cuda'
    weight_dtype = torch.bfloat16

    # Move every model to the GPU and cast it to weight_dtype
    text_encoder.to(device, dtype=weight_dtype)
    vae.to(device, dtype=weight_dtype)
    unet.to(device, dtype=weight_dtype)
    yolo_model.to(device, dtype=weight_dtype)

    num_train_epochs = math.ceil(CFG.max_train_steps * CFG.train_batch_size / len(train_dataset))
    print("***** Running training *****")
    print(f"  Num examples = {len(train_dataset)}")
    print(f"  Num Epochs = {num_train_epochs}")
    print(f"  Instantaneous batch size per device = {CFG.train_batch_size}")
    print(f"  Total optimization steps = {CFG.max_train_steps}")

    global_step = 0
    initial_global_step = 0

    progress_bar = tqdm(
        range(0, CFG.max_train_steps),
        initial=initial_global_step,
        desc="Steps",
    )

    losses = []

    for epoch in range(num_train_epochs):
        loss_per_epoch = 0.0
        steps_in_epoch = 0

        for step, batch in enumerate(train_dataloader):
            # Convert images to latent space
            latents = vae.encode(batch["pixel_values"].to(weight_dtype).to(device)).latent_dist.sample()
            latents = latents * vae.config.scaling_factor

            # Sample noise that we'll add to the latents
            noise = torch.randn_like(latents)
            batch_size = latents.shape[0]
            # Sample a random timestep for each image
            timesteps = torch.randint(
                0, noise_scheduler.config.num_train_timesteps, (batch_size,), device=latents.device
            )
            timesteps = timesteps.long()

            # Add noise to the latents according to the noise magnitude at each timestep
            # (this is the forward diffusion process)
            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

            # Get the text embedding for conditioning
            encoder_hidden_states = text_encoder(batch["input_ids"].to(device), return_dict=False)[0]

            # Predict the noise residual and compute loss
            model_pred = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0]
            mse_loss = F.mse_loss(model_pred.float(), noise.float(), reduction="mean")

            # ---- YOLO loss ----
            # Index the schedule on the same device as the timesteps instead of relying
            # on the scheduler having moved its buffer already.
            alphas_cumprod = noise_scheduler.alphas_cumprod.to(device=timesteps.device)
            sqrt_alpha_prod = (alphas_cumprod[timesteps] ** 0.5).view(-1, 1, 1, 1)
            sqrt_one_minus_alpha_prod = ((1 - alphas_cumprod[timesteps]) ** 0.5).view(-1, 1, 1, 1)

            # Clean-latent estimate obtained by removing the predicted noise.
            denoised_latents = (noisy_latents - model_pred * sqrt_one_minus_alpha_prod) / sqrt_alpha_prod
            denoised_sample = vae.decode(denoised_latents.bfloat16() / vae.config.scaling_factor).sample

            # NOTE(review): the decoded sample is passed to the detector as-is, without
            # any detector-specific preprocessing — confirm that this is intended.
            logits = yolo_model(denoised_sample).logits

            # For boxes above the threshold whose label is one of [20, 23, 24, 25], sum
            # the logits and take the cross entropy against the image label: among the
            # target animals, the right class must be predicted. Predictions for any
            # other class are ignored.
            batch_logits = _sum_target_box_logits(logits, CFG.threshold)
            yolo_loss = ce(batch_logits, batch["labels"].to(device))

            loss = mse_loss + CFG.yolo_loss_lambda * yolo_loss

            # Backpropagate
            loss.backward()
            torch.nn.utils.clip_grad_norm_(unet.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()

            loss_value = loss.item()
            losses.append(loss_value)
            progress_bar.update(1)

            loss_per_epoch += loss_value
            steps_in_epoch += 1

            wandb.log(
                {
                    "Train/epoch": epoch,
                    "Train/step": step,
                    "Train/loss": loss_value,
                    "Train/mse_loss": mse_loss.item(),
                    "Train/yolo_loss": yolo_loss.item(),
                    "Train/global_step": global_step,
                }
            )

            global_step += 1
            progress_bar.set_postfix(average_loss=np.mean(losses[-20:]), step=global_step)
            if global_step >= CFG.max_train_steps:
                break

            # Periodic evaluation: render the fixed prompts and log them
            if global_step % CFG.eval_steps == 0:
                imgs = generation_loop(pipe, CFG.prompts)
                predictions = [wandb.Image(image, caption=prompt) for image, prompt in zip(imgs, CFG.prompts)]
                wandb.log({"predictions": predictions})

        # Average over the steps actually run, so a truncated final epoch is not diluted.
        wandb.log({"Train/epoch_loss": loss_per_epoch / max(steps_in_epoch, 1)})

        # Leave the epoch loop as well once the step budget is spent; otherwise each
        # remaining epoch would still run one extra optimisation step.
        if global_step >= CFG.max_train_steps:
            break

    progress_bar.close()

    # Final evaluation with the trained weights
    imgs = generation_loop(pipe, CFG.prompts)
    predictions = [wandb.Image(image, caption=prompt) for image, prompt in zip(imgs, CFG.prompts)]
    wandb.log({"predictions": predictions})

    # Uploading from inside this function raised an error, so return the models instead
    return vae, unet


In [None]:
# Run training; keep the returned vae and unet as globals for the upload cell.
vae, unet = main()

Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
unet/diffusion_pytorch_model.safetensors not found


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 243
  Num Epochs = 66
  Instantaneous batch size per device = 8
  Total optimization steps = 2000


Steps:   0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [13]:
# Rebuild a pipeline from the base checkpoint with the trained vae and unet swapped in,
# then push it to a private Hub repo named after CFG.name, authenticating with the
# 'hf_write' secret from Colab user data.
new_pipeline = DiffusionPipeline.from_pretrained(
    CFG.base_model_name,
    vae=vae,
    unet=unet,
)
new_pipeline.push_to_hub(f"ioai2024japan/{CFG.name}", token=userdata.get('hf_write'), private=True)

Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.
safety_checker/model.safetensors not found


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/1.72G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ioai2024japan/chizu_026_011_alpha_0.1_yolo_loss_fixed/commit/4aebd9de09400883e40c54e8d29166cbce4d9dee', commit_message='Upload StableDiffusionPipeline', commit_description='', oid='4aebd9de09400883e40c54e8d29166cbce4d9dee', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
import gc
import torch

def flush():
    """Run Python garbage collection, then empty PyTorch's CUDA cache."""
    gc.collect()
    torch.cuda.empty_cache()


# Free memory right away.
flush()