In [4]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import cv2
from tqdm import tqdm

In [1]:
threshold = -40

In [2]:
# https://www.kaggle.com/code/muhammadfauzannafiz/save-mel-spectrogram-v2

def sound_wave_to_mel_spectrogram(sound_wave, sample_rate, spec_h=227, spec_w=227, duration=None, fmin = 0, fmax=8000):
    NUM_MELS = spec_h
    HOP_LENGTH = int(sample_rate * duration / (spec_w - 1)) if duration else int(sample_rate / (spec_w - 1))
    mel_spec = librosa.feature.melspectrogram(y=sound_wave, sr=sample_rate, hop_length=HOP_LENGTH, n_mels=NUM_MELS, fmin = fmin ,fmax=fmax)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    mel_spec_db[mel_spec_db < threshold] = threshold
    return mel_spec_db

def read_sound_wave(file_path, duration=None):
    sound_wave, sample_rate = librosa.load(file_path, sr=None, duration=duration)
    return sound_wave, sample_rate


def audio_segment_to_mel_spectrogram_rgb_227(audio_path, output_dir, spec_w, spec_h, duration=None):

    # Load the audio file
    y, sr = read_sound_wave(audio_path, duration=duration)

    # Generate mel-spectrogram
    # mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)

    # # Convert to decibels
    # mel_spec = librosa.power_to_db(mel_spectrogram, ref=np.max)
    # mel_spec[mel_spec < -40] = -40
    mel_spec = sound_wave_to_mel_spectrogram(y, sr, spec_w = spec_w, spec_h = spec_h, duration=duration)

    # Plot mel-spectrogram without axes, title, or colorbar
    plt.figure(figsize=(spec_h/300, spec_h/300), dpi=300)
    librosa.display.specshow(mel_spec, sr=sr, x_axis='time', y_axis='mel')
    plt.axis('off')
        
    # Get the original audio file name without extension
    audio_filename = os.path.splitext(os.path.basename(audio_path))[0]
    
    # Save the image with the same name as the audio file
    output_image_path = os.path.join(output_dir, f'{audio_filename}.png')
    plt.savefig(output_image_path, bbox_inches='tight', pad_inches=0)
    plt.close()
    
    # Load the saved image
    log_S = cv2.imread(output_image_path)

    # Resize the image using cv2.resize
    log_S_resized = cv2.resize(log_S, (spec_w, spec_h), interpolation=cv2.INTER_CUBIC)

    # Save the resized image
    cv2.imwrite(output_image_path, log_S_resized)

def process_directory_structure(base_directory, img_width=227, img_height=227):
    for root, dirs, files in os.walk(base_directory):
        if 'denoised' in root and 'segmented_audio' in root:
            # Determine the corresponding melspectrogram denoised path
            base_path = root.split(os.sep)
            if 'segmented_audio' in base_path:
                seg_idx = base_path.index('segmented_audio')
                melspectrogram_path_parts = base_path[:seg_idx] + ['melspectrogram', 'denoised'] + base_path[seg_idx + 2:]
                melspectrogram_path = os.sep.join(melspectrogram_path_parts)

                for file in tqdm(files, desc=f"Processing {root}"):
                    if file.endswith('.wav'):
                        input_path = os.path.join(root, file)
                        output_dir = melspectrogram_path
                        os.makedirs(output_dir, exist_ok=True)
                        
                        audio_segment_to_mel_spectrogram_rgb_227(input_path, output_dir, img_width, img_height)
                        # print(f"Saved melspectrogram for {input_path} to {output_dir}")


In [5]:
# Example usage
# base_directory = '/home/jupyter/data/test_data'
base_directory = '/home/jupyter/data/processed/uncertain'
process_directory_structure(base_directory)           

Processing /home/jupyter/data/processed/uncertain/segmented_audio/denoised: 100%|██████████| 94/94 [00:22<00:00,  4.11it/s]
Processing /home/jupyter/data/processed/uncertain/segmented_audio/denoised/.ipynb_checkpoints: 0it [00:00, ?it/s]


# Another method
https://www.kaggle.com/code/ritvik1909/speech-classification-spectrogram-cnn


In [41]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

def sound_wave_to_mel_spectrogram(sound_wave, sample_rate, spec_h=256, spec_w=256, length=1):
    NUM_MELS = spec_h
    HOP_LENGTH = int(sample_rate * length / (spec_w - 1)) 
    mel_spec = librosa.feature.melspectrogram(y=sound_wave, sr=sample_rate, hop_length=HOP_LENGTH, n_mels=NUM_MELS)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    mel_spec_db[mel_spec_db < threshold] = threshold
    return mel_spec_db

def read_sound_wave(file_path, duration=None):
    sound_wave, sample_rate = librosa.load(file_path, sr=None, duration=duration)
    return sound_wave, sample_rate

def process_directory_structure(base_directory, spec_h=128, spec_w=128, length=1):
    for root, dirs, files in os.walk(base_directory):
        if 'denoised' in root and 'segmented_audio' in root:
            # Determine the corresponding melspectrogram denoised path
            base_path = root.split(os.sep)
            if 'segmented_audio' in base_path:
                seg_idx = base_path.index('segmented_audio')
                melspectrogram_path_parts = base_path[:seg_idx] + ['melspectrogram', 'denoised'] + base_path[seg_idx + 2:]
                melspectrogram_path = os.sep.join(melspectrogram_path_parts)

                for file in tqdm(files, desc=f"Processing {root}"):
                    if file.endswith('.wav'):
                        input_path = os.path.join(root, file)
                        output_dir = melspectrogram_path
                        os.makedirs(output_dir, exist_ok=True)
                        
                        sound_wave, sample_rate = read_sound_wave(input_path)
                        mel_spec_db = sound_wave_to_mel_spectrogram(sound_wave, sample_rate, spec_h, spec_w, length)
                        
                        # Save the spectrogram as an image
                        output_path = os.path.join(output_dir, file.replace('.wav', '.png'))
                        
                        # Correcting the vertical flip by reversing the y-axis
                        plt.figure(figsize=(spec_w / 300, spec_h / 300), dpi=300)
                        librosa.display.specshow(mel_spec_db, sr=sample_rate, x_axis='time', y_axis='mel', cmap='gray_r')
                        plt.axis('off')
                        plt.savefig(output_path, bbox_inches='tight', pad_inches=0)
                        plt.close()

                        # print(f"Saved melspectrogram for {input_path} to {output_path}")

# Example usage
base_directory = '/Users/abhishek/colossal/data/processed/'
process_directory_structure(base_directory)


Processing /Users/abhishek/colossal/data/processed/pacific_imperial_pigeon/segmented_audio/denoised: 0it [00:00, ?it/s]
Processing /Users/abhishek/colossal/data/processed/pacific_imperial_pigeon/segmented_audio/denoised/many_color_fruit_dove: 100%|██████████| 8/8 [00:00<00:00, 23.43it/s]
Processing /Users/abhishek/colossal/data/processed/pacific_imperial_pigeon/segmented_audio/denoised/samoan_starling: 100%|██████████| 21/21 [00:00<00:00, 22.71it/s]
Processing /Users/abhishek/colossal/data/processed/pacific_imperial_pigeon/segmented_audio/denoised/Ducula: 100%|██████████| 140/140 [00:06<00:00, 20.35it/s]
Processing /Users/abhishek/colossal/data/processed/pacific_imperial_pigeon/segmented_audio/denoised/samoan_whistler: 100%|██████████| 23/23 [00:00<00:00, 27.08it/s]
Processing /Users/abhishek/colossal/data/processed/pacific_imperial_pigeon/segmented_audio/denoised/eastern_wattled_honeyeater: 100%|██████████| 123/123 [00:04<00:00, 25.20it/s]
Processing /Users/abhishek/colossal/data/proc