# Preprocessing

In [None]:
import os
import shutil

# Folder holding the recordings, and the folder where unpaired files are parked
source_dir = "/media/george-vengrovski/Extreme SSD/wav_and_npz_files"
temp_dir = "/media/george-vengrovski/Extreme SSD/temp"

# The holding folder must exist before anything can be moved into it
os.makedirs(temp_dir, exist_ok=True)

# Snapshot of every entry name in the source folder
all_files = os.listdir(source_dir)

# Collect base names (the 4-character extension stripped) per file type
wav_files = set()
npz_files = set()
for entry in all_files:
    if entry.endswith('.wav'):
        wav_files.add(entry[:-4])
    elif entry.endswith('.npz'):
        npz_files.add(entry[:-4])

# Base names that exist with only one of the two extensions
non_matching_npz = npz_files.difference(wav_files)
non_matching_wav = wav_files.difference(npz_files)

# Relocate the unpaired files: npz first, then wav
for extension, orphans in (('.npz', non_matching_npz), ('.wav', non_matching_wav)):
    for stem in orphans:
        filename = stem + extension
        shutil.move(
            os.path.join(source_dir, filename),
            os.path.join(temp_dir, filename),
        )

print(f"Moved {len(non_matching_npz)} non-matching npz files to {temp_dir}")
print(f"Moved {len(non_matching_wav)} non-matching wav files to {temp_dir}")

In [1]:
import numpy as np
import os
import csv
from scipy.io import wavfile

# Spectrogram settings (not referenced by the rest of this cell)
NFFT = 1024  # FFT length, in points
step_size = 119  # Step size for overlap

# Input recordings and the root folder of the generated dataset
source_dir = "/media/george-vengrovski/Extreme SSD/wav_and_npz_files"
output_dir = "/media/george-vengrovski/Extreme SSD/custom_dataset"

# One subfolder per split, created up front together with any missing parents
train_dir, test_dir = (
    os.path.join(output_dir, split_name) for split_name in ("train", "test")
)
for split_dir in (train_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)

def _label_segments(labels, time_per_label):
    """Collapse runs of equal labels into ``[onset, offset, cluster]`` rows.

    ``labels`` must be non-empty. Onsets and offsets are label indices scaled
    by ``time_per_label``. ``cluster`` is 1 when the run's label equals 1 and
    0 for every other label value.
    """
    rows = []
    start = 0
    current = labels[0]
    for index, label in enumerate(labels[1:], start=1):
        if label != current:
            rows.append([
                start * time_per_label,
                index * time_per_label,
                1 if current == 1 else 0,
            ])
            start, current = index, label

    # Close the run that extends to the end of the label sequence
    rows.append([
        start * time_per_label,
        len(labels) * time_per_label,
        1 if current == 1 else 0,
    ])
    return rows


def process_file(npz_file, wav_file, output_folder, base_name):
    """Write one recording and its label segments into ``output_folder``.

    Reads the ``'song'`` array from ``npz_file`` and the audio from
    ``wav_file``, then writes ``<base_name>.wav`` (the audio, unchanged) and
    ``<base_name>.csv`` (header ``onset, offset, cluster`` followed by one row
    per run of identical consecutive labels).

    Args:
        npz_file: Path to the ``.npz`` archive containing a ``'song'`` array.
        wav_file: Path to the matching ``.wav`` recording.
        output_folder: Existing directory that receives the two output files.
        base_name: Output file name without extension.

    Raises:
        ValueError: If the ``'song'`` array is empty, since no time scale per
            label can be derived. Nothing is written in that case.
        KeyError: If the archive has no ``'song'`` entry.
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
    # only use this on trusted .npz files.
    # The context manager closes the archive's file handle, which would
    # otherwise stay open for every file processed in a long batch.
    with np.load(npz_file, allow_pickle=True) as data:
        labels = data['song']

    # Load wav file
    sample_rate, audio = wavfile.read(wav_file)

    if len(labels) == 0:
        raise ValueError(f"'song' array in {npz_file} is empty")

    # Audio duration (samples / sample rate) spread evenly over the labels
    time_per_label = len(audio) / (sample_rate * len(labels))

    rows = _label_segments(labels, time_per_label)

    wav_path = os.path.join(output_folder, base_name + '.wav')
    csv_path = os.path.join(output_folder, base_name + '.csv')

    # Write audio to wav file
    wavfile.write(wav_path, sample_rate, audio)

    # Write labels to CSV; newline='' lets the csv module control line endings
    with open(csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['onset', 'offset', 'cluster'])
        writer.writerows(rows)

# Process files
all_files = os.listdir(source_dir)

# os.listdir returns names in arbitrary order; sorting makes the index-based
# train/test assignment below reproducible from run to run.
# NOTE(review): clear train/ and test/ before re-running if the set of source
# files has changed, otherwise a file can end up in both splits.
npz_files = sorted(f for f in all_files if f.endswith('.npz'))

# Set of every entry name so the wav lookup is O(1) instead of a list scan
available = set(all_files)

for i, npz_file in enumerate(npz_files):
    base_name = npz_file[:-4]
    wav_file = base_name + '.wav'

    # Skip label files that have no matching recording
    if wav_file not in available:
        continue

    # Every fifth npz file goes to test, the rest to train (80/20 split).
    # The index counts npz files without a wav too, so the ratio is approximate.
    output_folder = train_dir if i % 5 != 0 else test_dir

    npz_path = os.path.join(source_dir, npz_file)
    wav_path = os.path.join(source_dir, wav_file)

    # Best-effort batch: report a failing file and carry on with the rest
    try:
        # Process the file
        process_file(npz_path, wav_path, output_folder, base_name)

        print(f"Processed {base_name}")
    except Exception as e:
        print(f"Failed to process {base_name}: {e}")

print("Dataset creation completed.")

Processed USA5177_45268.29182755_12_8_8_6_22
Processed USA5177_45268.29814010_12_8_8_16_54
Processed USA5177_45268.29836207_12_8_8_17_16
Processed USA5177_45268.29874090_12_8_8_17_54
Processed USA5177_45268.29889742_12_8_8_18_9
Processed USA5177_45268.29899591_12_8_8_18_19
Processed USA5177_45268.30891574_12_8_8_34_51
Processed USA5177_45268.31223671_12_8_8_40_23
Processed USA5177_45268.31297012_12_8_8_41_37
Processed USA5177_45268.31311099_12_8_8_41_51
Processed USA5177_45268.31340083_12_8_8_42_20
Processed USA5177_45268.32678108_12_8_9_4_38
Processed USA5177_45268.33401526_12_8_9_16_41
Processed USA5177_45268.33917634_12_8_9_25_17
Processed USA5177_45268.34284536_12_8_9_31_24
Processed USA5177_45268.34315584_12_8_9_31_55
Processed USA5177_45268.34782881_12_8_9_39_42
Processed USA5177_45268.34814030_12_8_9_40_14
Processed USA5177_45268.34831609_12_8_9_40_31
Processed USA5177_45268.34849421_12_8_9_40_49
Processed USA5177_45268.51657912_12_8_14_20_57
Processed USA5177_45268.51677881_12_