In [4]:
import os

import torch
from datasets import load_dataset
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler

[2023-07-19 12:32:19,537] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [5]:
# Read the training split of the spectrogram image folder via the
# `datasets` "imagefolder" loader.
dataset = load_dataset(
    "imagefolder",
    data_dir="/home/ryan/diss/msc_diss/sdspeech/data/AudioSet/data/Bird vocalization-bird call-bird song/train",
    split="train",
)

Resolving data files:   0%|          | 0/195 [00:00<?, ?it/s]

Downloading and preparing dataset imagefolder/default to /home/ryan/.cache/huggingface/datasets/imagefolder/default-f96ff42a1aa9403f/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...


Downloading data files:   0%|          | 0/98 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/97 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/97 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset imagefolder downloaded and prepared to /home/ryan/.cache/huggingface/datasets/imagefolder/default-f96ff42a1aa9403f/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f. Subsequent calls will reuse this data.


In [11]:
# Peek at the first record to check the columns the loader produced
# (the displayed record has 'image', 'prompt' and 'audiofile' keys).
dataset[0]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=501x512>,
 'prompt': 'a spectrogram of bird song',
 'audiofile': './data/Bird vocalization-bird call-bird song/train/-aC8TJIZrtE.wav'}

In [14]:
# Run training script
!accelerate launch --mixed_precision="fp16"  train_text_to_image_lora.py \
  --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
  --train_data_dir="/home/ryan/diss/msc_diss/sdspeech/data/AudioSet/data/Bird vocalization-bird call-bird song/train" \
  --val_data_dir="/home/ryan/diss/msc_diss/sdspeech/data/AudioSet/data/Bird vocalization-bird call-bird song/val" \
  --dataloader_num_workers=8 \
  --resolution=512 \
  --train_batch_size=1 \
  --val_batch_size=1 \
  --gradient_accumulation_steps=4 \
  --max_train_steps=15000 \
  --learning_rate=1e-05 \
  --max_grad_norm=1 \
  --validation_epochs=1 \
  --lr_scheduler="cosine" \
  --lr_warmup_steps=1000 \
  --output_dir=$"./out/20-07/02" \
  --report_to=wandb \
  --checkpointing_steps=500 \
  --validation_prompt="A spectrogram of bird song" \
  --seed=42

[2023-07-20 16:17:14,998] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-07-20 16:17:17,312] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-07-20 16:17:18,579] [INFO] [comm.py:594:init_distributed] cdb=None
[2023-07-20 16:17:18,579] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
07/20/2023 16:17:18 - INFO - __main__ - Distributed environment: DEEPSPEED  Backend: nccl
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda:0

Mixed precision type: fp16
ds_config: {'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 1, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'cpu', 'nvme_path': None}, 'offload_param': {'device': 'cpu', 'nvme_path': None}, 'stage3_gather_16bit_weights_on_model_save': False}, 'steps_per_print': inf, 'fp16': {'enabled': True, 'auto_cas

In [None]:
# Run training script
!accelerate launch --mixed_precision="fp16"  train_text_to_image_lora.py \
  --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
  --train_data_dir="/home/ryan/diss/msc_diss/sdspeech/data/AudioSet/spec/Bird vocalization-bird call-bird song/train" \
  --dataloader_num_workers=8 \
  --resolution=512 \
  --train_batch_size=1 \
  --gradient_accumulation_steps=4 \
  --max_train_steps=15000 \
  --learning_rate=1e-05 \
  --max_grad_norm=1 \
  --lr_scheduler="cosine" --lr_warmup_steps=0 \
  --output_dir=$"./out/13-07/2" \
  --report_to=wandb \
  --checkpointing_steps=500 \
  --validation_prompt="A spectrogram of bird song" \
  --seed=42

In [None]:
# Inference setup

# Identifier of the base Stable Diffusion checkpoint.
model_base = "runwayml/stable-diffusion-v1-5"

# Build the base pipeline in half precision. No fine-tuned weights are loaded
# in this cell; it only prepares the pipeline object.
pipe = StableDiffusionPipeline.from_pretrained(
    model_base,
    torch_dtype=torch.float16,
)

# Replace the pipeline's scheduler with a DPMSolverMultistepScheduler built
# from the existing scheduler's config.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(
    pipe.scheduler.config
)


In [None]:
# Attach the LoRA attention processors saved under ./out/26-06 to the UNet,
# then move the whole pipeline to the GPU.
pipe.unet.load_attn_procs("./out/26-06")
pipe.to("cuda")

# Directory that receives the generated images. It is created up front so
# image.save() cannot fail on a missing folder.
inf_out = "inference/out/11-07/no_aug"
os.makedirs(inf_out, exist_ok=True)

seeds = [0, 1, 42, 49, 55, 1337, 26000, 50000, 50101]

prompt = "a spectrogram of bird song"

for seed in seeds:
    # Base model only: a LoRA scale of 0 switches the LoRA contribution off.
    gen = torch.manual_seed(seed)
    image = pipe(
        prompt,
        num_inference_steps=25,
        guidance_scale=7.5,
        cross_attention_kwargs={"scale": 0},
        generator=gen,
    ).images[0]
    # os.path.join puts the file *inside* inf_out; plain string concatenation
    # used to glue "no_aug" onto the front of the filename instead.
    image.save(os.path.join(inf_out, f"{prompt}_{seed}_base.png"))

    # Fully fine-tuned LoRA weights (scale 1.0). Re-seed and pass the generator
    # so this image starts from the same noise as the base image above and the
    # pair is directly comparable.
    gen = torch.manual_seed(seed)
    image = pipe(
        prompt,
        num_inference_steps=25,
        guidance_scale=7.5,
        cross_attention_kwargs={"scale": 1.0},
        generator=gen,
    ).images[0]
    image.save(os.path.join(inf_out, f"{prompt}_{seed}_lora.png"))


In [None]:
import os
import matplotlib.pyplot as plt
from matplotlib.image import imread

def display_images_in_grid(folder_path, num_cols=2, col_titles=("Base", "LoRA")):
    """Show every image in ``folder_path`` as a grid of Matplotlib subplots.

    Files are sorted alphabetically and laid out row by row, so with the
    default two columns consecutive files land side by side.

    Args:
        folder_path: Directory to scan (non-recursively) for image files with
            a .jpg, .jpeg, .png or .gif extension (case-insensitive).
        num_cols: Number of columns in the grid. Must be at least 1.
        col_titles: Titles placed above the first row, one per column;
            extra titles beyond ``num_cols`` are ignored.

    Returns:
        None. The figure is shown with ``plt.show()``. If the folder contains
        no matching images, a message is printed and nothing is plotted.

    Raises:
        ValueError: If ``num_cols`` is smaller than 1.
    """
    if num_cols < 1:
        raise ValueError("num_cols must be at least 1")

    image_exts = ('.jpg', '.jpeg', '.png', '.gif')
    image_files = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith(image_exts)
    )

    # Nothing to draw: bail out before asking Matplotlib for a zero-row grid,
    # which it rejects.
    if not image_files:
        print(f"No images found in {folder_path!r}")
        return

    # Ceiling division: enough rows to hold every image.
    num_rows = (len(image_files) + num_cols - 1) // num_cols

    # squeeze=False keeps `axs` two-dimensional even for a single row or
    # column, so row/column indexing below is always valid. The figure size
    # grows with the grid so images are not squashed when there are many rows.
    _, axs = plt.subplots(
        num_rows,
        num_cols,
        figsize=(6 * num_cols, 6 * num_rows),
        squeeze=False,
    )

    # Hide the frame on every cell, including trailing cells left empty when
    # the image count is not a multiple of num_cols.
    for ax in axs.flat:
        ax.axis("off")

    # axs.flat walks the grid row by row, matching the sorted file order.
    for ax, image_file in zip(axs.flat, image_files):
        ax.imshow(imread(os.path.join(folder_path, image_file)))

    # Column headers go on the first row only.
    for ax, title in zip(axs[0], col_titles):
        ax.set_title(title)

    # tight_layout computes the final spacing itself, so no manual
    # subplots_adjust call is needed beforehand.
    plt.tight_layout()
    plt.show()

In [None]:
# Display inference results

# Folder holding the generated images to compare.
folder_path = "./inference/05-06-spec-test/"

# Render every image in that folder as a grid.
display_images_in_grid(folder_path=folder_path)