In [16]:
# input: path to preprocessed audio files, labels, from 'preprocess.ipynb'
# output: saved embeddings of all clips, with their labels

import numpy as np
import os
import tensorflow_hub as hub 
import librosa

In [17]:
# Define the embedding generator class 

class EmbeddingGenerator:
    def __init__(self, model_url='https://www.kaggle.com/models/google/bird-vocalization-classifier/TensorFlow2/bird-vocalization-classifier/8', sr=32000, target_samples=160000):
        """ 
        Initialize the EmbeddingGenerator with the Bird Vocalization Classifier model.

        Args:                                                                                                                           
            model_url (str): URL to load the Bird Vocalization Classifier model.
            sr (int): Sample rate for the audio segments (default is 32000).
            target_samples (int): Target number of samples for each segment (default is 160000).
        """
        self.model = hub.load(model_url)
        self.sr = sr
        self.target_samples = target_samples

    def preprocess_segment(self, segment):
        """ 
        Preprocesses a single audio segment by padding or truncating to the target sample length.

        Args:
            segment (np.array): The audio segment to preprocess.

        Returns:
            np.array: Padded or truncated audio segment ready for embedding.
        """
        current_samples = len(segment)
            
        if current_samples < self.target_samples:
            # Calculate padding lengths and pad with zeros
            pad_length = self.target_samples - current_samples
            pad_left = pad_length // 2
            pad_right = pad_length - pad_left
            padded_segment = np.pad(segment, (pad_left, pad_right), 'constant')
            return padded_segment
        else:
            # Truncate if longer than target duration
            return segment[:self.target_samples]
        
    def generate_embeddings(self, segments):
        """
        Generates embeddings for a list of audio segments using the Bird Vocalization Classifier model.

        Args:
            segments (list of np.array): List of preprocessed audio segments.

        Returns:
            np.array: Array of embeddings for each segment.
        """
        embeddings = []
        
        for segment in segments:
            # Preprocess each segment to match the model's target sample length
            processed_segment = self.preprocess_segment(segment)
            
            # Reshape to fit the model's expected input dimensions
            audio_input = processed_segment[np.newaxis, :]  # Shape (1, 160000)
            
            # Generate embeddings using the model
            result = self.model.infer_tf(audio_input)
            embedding = result['embedding']
            
            # Flatten and store the embedding
            embeddings.append(embedding.numpy().flatten())

        return np.array(embeddings)
    
# define the segment loader: 


def load_denoised_segments(output_base_path):
    """
    Load the denoised audio segments from the saved directory.

    Args:
    - output_base_path (str): Base path for output directories where the function saved the segments.

    Returns:
    - denoised_segments (list): A list of numpy arrays representing the denoised audio segments.
    """
    denoised_segments = []
    
    denoised_dir = os.path.join(output_base_path, 'segmented_audio', 'denoised')

    # Traverse the species directories within the denoised directory
    for species_dir in os.listdir(denoised_dir):
        species_path = os.path.join(denoised_dir, species_dir)

        if os.path.isdir(species_path):  # Ensure it's a directory
            # Load all .wav files in the species directory
            for file in os.listdir(species_path):
                if file.endswith('.wav'):  # Only process .wav files
                    file_path = os.path.join(species_path, file)
                    # Load the audio file
                    try:
                        audio, sr = librosa.load(file_path, sr=None)  # Load with original sampling rate
                        denoised_segments.append(audio)
                    except Exception as e:
                        print(f"Error loading file {file_path}: {e}")
    
    return denoised_segments



In [21]:
# define where the data is, and the folders to read
base_dir = "/home/leah_colossal_com/tbp_dataset/balanced_dataset/test_data"

folders = [name for name in os.listdir(base_dir) 
    if os.path.isdir(os.path.join(base_dir, name)) and name != 'model']

#folders = ['white_throated_sparrow','northern_cardinal',
#                'carolina_wren','eastern_towhee', 'kentucky_warbler']


# Make embeddings for each folder
embedding_generator = EmbeddingGenerator()
for folder_idx,folder in enumerate(folders):

    # define files and folders
    processed_dir = os.path.join(base_dir,folder,'processed')
    labels_file = os.path.join(processed_dir,"labels.npy")
    embeddings_file = os.path.join(processed_dir,"embeddings.npz")               
    
    # load the preprocessed data                
    denoised_segments = load_denoised_segments(processed_dir)
    print(f"Loaded {len(denoised_segments)} denoised segments.")
    
    # Generate and save embeddings for the denoised segments
    embeddings = embedding_generator.generate_embeddings(denoised_segments)
    np.savez_compressed(embeddings_file, *embeddings)

    # Generate and save labels for these embeddings
    labels = [folder]*len(embeddings)
    np.save(labels_file, labels)
    

FileNotFoundError: [Errno 2] No such file or directory: '/home/leah_colossal_com/tbp_dataset/balanced_dataset/test_data/model/processed/segmented_audio/denoised'