In [2]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import numpy as np
import re
import shutil
import random
import pydub
import sys
import csv

sys.path.append("..")
from utils.spectrogram_image_converter import SpectrogramImageConverter
from utils.spectrogram_params import SpectrogramParams
import typing as T
from PIL import Image

In [None]:
# Set up
device = os.environ.get("RIFFUSION_TEST_DEVICE", "cuda")

# experiment label
label = "Bird vocalization-bird call-bird song"

# set paths
wav_source = "./wav/" + label
spec_dest = "./spec/" + label + "/unsplit"
os.makedirs(spec_dest, exist_ok=True)

for wav in os.listdir(wav_source):
    # Convert wav to audiosegment
    segment = pydub.AudioSegment.from_wav(wav_source + "/" + wav)

    # Convert to mono if desired
    use_stereo = False
    if use_stereo:
        assert segment.channels == 2
    else:
        segment = segment.set_channels(1)

    # Define named sets of parameters
    param_sets: T.Dict[str, SpectrogramParams] = {}

    param_sets["default"] = SpectrogramParams(
        sample_rate=44100,
        stereo=use_stereo,
        step_size_ms=20,
        min_frequency=20,
        max_frequency=20000,
        num_frequencies=512,
    )
    print(segment.frame_rate)

    segments: T.Dict[str, pydub.AudioSegment] = {
        "original": segment,
    }

    images: T.Dict[str, Image.Image] = {}

    for name, params in param_sets.items():
        converter = SpectrogramImageConverter(params=params, device=device)
        images[name] = converter.spectrogram_image_from_audio(segment)
        segments[name] = converter.audio_from_spectrogram_image(
            image=images[name],
            apply_filters=True,
        )

    # Save images to disk
    for name, image in images.items():
        image_out = spec_dest + "/" + os.fsdecode(wav) + ".png"
        image.save(image_out, exif=image.getexif(), format="PNG")
        print(f"Saved {image_out}")

In [31]:
# Create splits and folders
def split_folder(input_folder, output_folder, split_ratio):
    # Create output folders
    train_folder = os.path.join(output_folder, "train")
    validation_folder = os.path.join(output_folder, "val")
    test_folder = os.path.join(output_folder, "test")
    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(validation_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)

    # Get a list of files in the input folder
    files = os.listdir(input_folder)

    # Shuffle the file list randomly
    random.shuffle(files)

    # Calculate the number of files for each split
    total_files = len(files)
    train_split = int(total_files * split_ratio[0])
    validation_split = int(total_files * split_ratio[1])
    test_split = total_files - train_split - validation_split

    # Move files to the respective folders
    for i, file in enumerate(files):
        source_path = os.path.join(input_folder, file)
        if i < train_split:
            destination_path = os.path.join(train_folder, file)
        elif i < train_split + validation_split:
            destination_path = os.path.join(validation_folder, file)
        else:
            destination_path = os.path.join(test_folder, file)
        shutil.copyfile(source_path, destination_path)


input_folder = spec_dest
output_folder = "./spec/" + label
split_ratio = [0.8, 0.1, 0.1]

split_folder(input_folder, output_folder, split_ratio)

In [42]:
# Create metadata csv
base_dir = "./spec/Bird vocalization-bird call-bird song/"
prompt = "a spectrogram of bird song"


def create_metadata(base_dir, folder, prompt):
    data = []
    folder_path = os.path.join(base_dir, folder)

    for file in os.listdir(folder_path):
        data.append((os.fsdecode(file), prompt))

    df = pd.DataFrame(data, columns=["file_name", "text"])
    output_csv_path = os.path.join(folder_path, "metadata.csv")
    df.to_csv(output_csv_path, index=False)


def create_metadata_csv_with_audio(source_folder, output_folder, prompt):
    import csv
    import os

    # Define the output path for the CSV file
    metadata_csv_path = os.path.join(output_folder, "metadata.csv")

    # Create the CSV file and write the header
    with open(metadata_csv_path, "w", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["file_name", "text", "audiofile"])

        # Get the list of PNG files in the source folder
        png_files = [f for f in os.listdir(source_folder) if f.endswith(".wav.png")]

        # Iterate over each PNG file
        for png_file in png_files:
            # Extract the filename without the extension
            file_name = os.path.splitext(png_file)[0]

            # Construct the corresponding WAV file path
            audio_file = os.path.join(output_folder, file_name)

            # Get the absolute path of the audio file
            abs_audio_file = os.path.abspath(audio_file)

            # Write the row to the CSV file
            writer.writerow([png_file, prompt, abs_audio_file])

In [43]:
# Create dataset with audio and image


def create_audio_image_dataset(label):
    import shutil
    import os

    # Copy across folders into dataset
    # Create output folders
    train_folder = os.path.join("./data", label, "train")
    validation_folder = os.path.join("./data", label, "val")
    test_folder = os.path.join("./data", label, "test")
    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(validation_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)

    # Copy across specs
    spec_source_train = os.path.join("./spec", label, "train")
    spec_source_val = os.path.join("./spec", label, "val")
    spec_source_test = os.path.join("./spec", label, "test")

    source_dest_dict = {
        train_folder: spec_source_train,
        validation_folder: spec_source_val,
        test_folder: spec_source_test,
    }

    for entry in source_dest_dict:
        for filename in os.listdir(source_dest_dict[entry]):
            if filename == "metadata.csv":
                continue
            source_file = os.path.join(source_dest_dict[entry], filename)
            dest_file = os.path.join(entry, filename)

            shutil.copy(source_file, dest_file)

    # Copy across corresponding audio data
    wav_folder = os.path.join("./wav", label)

    for filename in os.listdir(wav_folder):
        source_file = os.path.join(wav_folder, filename)

        for dest_folder in source_dest_dict.keys():
            dest_file = os.path.join(dest_folder, filename[:-4] + ".wav")
            # check if corresponding .wav.png file exists in the destination folder
            if os.path.isfile(os.path.join(dest_folder, filename[:-4] + ".wav.png")):
                shutil.copy(source_file, dest_file)


# experiment label
label = "Bird vocalization-bird call-bird song"
prompt = "a spectrogram of bird song"

create_audio_image_dataset(label)

for folder in ["train", "val", "test"]:
    folder_path = os.path.join("./data", label, folder)
    create_metadata_csv_with_audio(folder_path, folder_path, prompt)

In [6]:
def image_to_audio(image):
    # Define named sets of parameters
    param_sets: T.Dict[str, SpectrogramParams] = {}

    param_sets["default"] = SpectrogramParams(
        sample_rate=44100,
        stereo=False,
        step_size_ms=20,
        min_frequency=20,
        max_frequency=20000,
        num_frequencies=512,
    )

    converter = SpectrogramImageConverter(params=param_sets["default"], device="cuda")
    segment = converter.audio_from_spectrogram_image(image=image, apply_filters=False)

    # Convert to mono
    # segment = segment.set_channels(1)

    """     segment = segment.get_array_of_samples()
    segment = np.array(segment)
 """
    return segment

In [7]:
# TEST CONVERSION BACK

# Set up
device = os.environ.get("RIFFUSION_TEST_DEVICE", "cuda")

# experiment label
label = "Bird vocalization-bird call-bird song"

# set paths
spec_source = "./spec/" + label + "/unsplit"
wav_dest = "./spec_to_wav/" + label
os.makedirs(wav_dest, exist_ok=True)

segments: T.Dict[str, pydub.AudioSegment] = {}

for spec_filename in os.listdir(spec_source):
    if spec_filename == "metadata.csv":
        continue
    # Create full file path
    spec_filepath = os.path.join(spec_source, spec_filename)

    # Load the image file
    img = Image.open(spec_filepath)

    segment = image_to_audio(img)

    segments[spec_filename] = segment

# Save segments to disk
for name, segment in segments.items():
    audio_out = wav_dest + "/" + name + ".wav"
    segment.export(audio_out, format="wav")
    print(f"Saved {audio_out}")


"""     # Convert wav to audiosegment
    segment = pydub.AudioSegment.from_wav(wav_source + "/" + wav)

    # Convert to mono if desired
    use_stereo = False
    if use_stereo:
        assert segment.channels == 2
    else:
        segment = segment.set_channels(1)

    # Define named sets of parameters
    param_sets: T.Dict[str, SpectrogramParams] = {}

    param_sets["default"] = SpectrogramParams(
        sample_rate=segment.frame_rate,
        stereo=use_stereo,
        step_size_ms=20,
        min_frequency=20,
        max_frequency=20000,
        num_frequencies=512,
    )
    print(segment.frame_rate)


    images: T.Dict[str, Image.Image] = {}
    for name, params in param_sets.items():
        converter = SpectrogramImageConverter(params=params, device=device)
        images[name] = converter.spectrogram_image_from_audio(segment)
        segments[name] = converter.audio_from_spectrogram_image(
            image=images[name],
            apply_filters=True,
        )

    # Save segments to disk
    for name, segment in segments.items():
        audio_out = spec_dest + ".wav"
        segment.export(audio_out, format="wav")
        print(f"Saved {audio_out}") """

Saved ./spec_to_wav/Bird vocalization-bird call-bird song/dfD7GGotexs.wav.png.wav
Saved ./spec_to_wav/Bird vocalization-bird call-bird song/5UUf4vEJSX4.wav.png.wav
Saved ./spec_to_wav/Bird vocalization-bird call-bird song/kSu1zqB4Ttc.wav.png.wav
Saved ./spec_to_wav/Bird vocalization-bird call-bird song/Zne7nhlzUBQ.wav.png.wav
Saved ./spec_to_wav/Bird vocalization-bird call-bird song/U_5ZaPrWcbM.wav.png.wav
Saved ./spec_to_wav/Bird vocalization-bird call-bird song/9udlcVVxlMY.wav.png.wav
Saved ./spec_to_wav/Bird vocalization-bird call-bird song/KWD3SUy0WvA.wav.png.wav
Saved ./spec_to_wav/Bird vocalization-bird call-bird song/EtPIEB0fbzA.wav.png.wav
Saved ./spec_to_wav/Bird vocalization-bird call-bird song/DaD0gvQnqyo.wav.png.wav
Saved ./spec_to_wav/Bird vocalization-bird call-bird song/xi4z5O0VUFU.wav.png.wav
Saved ./spec_to_wav/Bird vocalization-bird call-bird song/xb_cRKUoACM.wav.png.wav
Saved ./spec_to_wav/Bird vocalization-bird call-bird song/pQk4a5fJrBc.wav.png.wav
Saved ./spec_to_

'     # Convert wav to audiosegment\n    segment = pydub.AudioSegment.from_wav(wav_source + "/" + wav)\n\n    # Convert to mono if desired\n    use_stereo = False\n    if use_stereo:\n        assert segment.channels == 2\n    else:\n        segment = segment.set_channels(1)\n\n    # Define named sets of parameters\n    param_sets: T.Dict[str, SpectrogramParams] = {}\n\n    param_sets["default"] = SpectrogramParams(\n        sample_rate=segment.frame_rate,\n        stereo=use_stereo,\n        step_size_ms=20,\n        min_frequency=20,\n        max_frequency=20000,\n        num_frequencies=512,\n    )\n    print(segment.frame_rate)\n\n\n    images: T.Dict[str, Image.Image] = {}\n    for name, params in param_sets.items():\n        converter = SpectrogramImageConverter(params=params, device=device)\n        images[name] = converter.spectrogram_image_from_audio(segment)\n        segments[name] = converter.audio_from_spectrogram_image(\n            image=images[name],\n            apply_fi