To compute evaluation metrics, we run inference with our model to generate a test set.
This code generates the images, which are then converted to audio for FAD (Fréchet Audio Distance) and IS (Inception Score) evaluation.

In [None]:
from diffusers import StableDiffusionPipeline
import torch
import json
import os
from PIL import Image

def generate_test_set(model, prompts, output_folder, device="cuda"):
    """Generate one image per prompt with a LoRA-adapted Stable Diffusion v1.5.

    Args:
        model: Path (or hub id) of the LoRA attention-processor weights; the
            file ``pytorch_model.bin`` inside it is loaded into the base UNet.
        prompts: Either an iterable of dicts carrying ``"caption"`` and
            ``"youtube_id"`` keys, or a path to a JSON file that contains
            such a list.
        output_folder: Directory the images are written to (created if it
            does not exist). Each image is saved as ``<youtube_id>.png``.
        device: Torch device the pipeline is moved to. Defaults to ``"cuda"``.

    Raises:
        ValueError: If a prompt entry has no ``"caption"`` or no
            ``"youtube_id"``.
    """
    # Accept a JSON file path as well as an already-loaded list, so callers
    # can pass e.g. "test_captions.json" directly.
    if isinstance(prompts, (str, os.PathLike)):
        with open(prompts, encoding="utf-8") as prompt_file:
            prompts = json.load(prompt_file)
    prompts = list(prompts)

    # Validate everything before the expensive model load so a malformed
    # entry fails immediately instead of after minutes of inference (or,
    # worse, silently producing "None.png").
    for index, prompt in enumerate(prompts):
        if prompt.get("caption") is None or prompt.get("youtube_id") is None:
            raise ValueError(
                f"prompt #{index} must define both 'caption' and 'youtube_id': {prompt!r}"
            )

    # prepare output folder
    os.makedirs(output_folder, exist_ok=True)

    # Nothing to generate: skip loading the model entirely.
    if not prompts:
        return

    # load base model, then the fine-tuned attention weights on top of it
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16,
        safety_checker=None,
        feature_extractor=None,
        requires_safety_checker=False,
    )
    pipe.unet.load_attn_procs(model, weight_name="pytorch_model.bin")
    pipe.to(device)

    for prompt in prompts:
        # With the default output_type ("pil") the pipeline already returns
        # PIL images, so no tensor -> image conversion is needed.
        image = pipe(prompt["caption"]).images[0]

        # Save the image under the clip's YouTube id
        image.save(os.path.join(output_folder, f"{prompt['youtube_id']}.png"))

In [None]:
# generate_test_set iterates over prompt dicts (it calls .get("caption") and
# .get("youtube_id") on each entry), so load the captions file first rather
# than handing it the filename.
with open("test_captions.json", encoding="utf-8") as captions_file:
    test_prompts = json.load(captions_file)

generate_test_set("/home/ryan/diss/msc_diss/sdspeech/model/sd_ex/lora/out/08-07/0.0001_25000steps_5000warmup/checkpoint-4984-best",
                  test_prompts, "data/test/image")