In [1]:
import librosa
import numpy as np
import matplotlib.pyplot as plt
import cv2 as cv
import os
import csv
import soundfile as sf
from scipy import signal
from datetime import datetime

In [None]:
# Directories for template audio, clips to make detections in, and output results.
# Replace __species__ in TEMPLATE_PATH with the actual file name of the template you want to use.

CLIPS_PATH = "data/clips"  # clips to make detections in
TEMPLATE_PATH = "data/template/__species__"  # placeholder: edit before running
OUTPUT_DIR = "data/output"  # output results are written under this directory

# Customizable parameters for template matching:

# Threshold for detection confidence.
# NOTE(review): the score scale is not visible in this section - confirm the valid range against the matching code.
THRESHOLD = 0.6

# Suppression distance to avoid multiple detections in close proximity.
# NOTE(review): units are not stated here (presumably spectrogram frames) - confirm against the suppression code.
SUPPRESSION_DISTANCE = 50

In [None]:
# Create the output directory up front; exist_ok=True makes re-running this cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# One local-time stamp (YYYYMMDD_HHMMSS) shared by both output file names, so files from
# different runs do not overwrite each other (at one-second resolution).
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Output file paths for results and CSV file

# Plain-text results file; its contents are produced by code outside this section.
OUTPUT_FILE = os.path.join(OUTPUT_DIR, f"results_{timestamp}.txt")
# The CSV file will contain all matches found during the template matching process, showing the name of the template,
# the name of the clip, the timestamp of the match, and the score of the match.
CSV_OUTPUT_FILE = os.path.join(OUTPUT_DIR, f"all_matches_{timestamp}.csv")

In [None]:
# Computes mel spectrogram from an audio signal

def compute_mel_spectrogram(y, sr, n_mels=128, hop_length=512):
    """Return the mel spectrogram of ``y`` in decibels.

    Args:
        y: Audio samples, passed straight to ``librosa.feature.melspectrogram``.
        sr: Sample rate of ``y``.
        n_mels: Number of mel bands to compute.
        hop_length: Hop length forwarded to librosa.

    Returns:
        The power spectrogram converted with ``librosa.power_to_db`` using
        ``ref=np.max``, i.e. dB values relative to the loudest cell.
    """
    mel_power = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_mels=n_mels,
        hop_length=hop_length,
    )
    return librosa.power_to_db(mel_power, ref=np.max)

# Converts a mel spectrogram in dB to an image format suitable for use in template matching or visualization

def spectrogram_to_image(S_dB):
    """Min-max scale a dB spectrogram to an 8-bit grayscale image.

    Args:
        S_dB: Non-empty numeric array (e.g. the output of
            ``compute_mel_spectrogram``).

    Returns:
        ``np.uint8`` array of the same shape, with the minimum of ``S_dB``
        mapped to 0 and the maximum to 255. If the input has no dynamic range
        (every value equal, e.g. digital silence, or a NaN range), an all-zero
        image is returned instead of dividing by zero.
    """
    lowest = S_dB.min()
    value_range = S_dB.max() - lowest
    # `not (x > 0)` also catches a NaN range, which would otherwise be cast
    # to uint8 with an undefined result.
    if not value_range > 0:
        return np.zeros(S_dB.shape, dtype=np.uint8)
    scaled = (S_dB - lowest) / value_range
    return (scaled * 255).astype(np.uint8)

# Finds the dominant frequency range in a mel spectrogram based on a particular energy threshold

def find_dominant_frequency_range(spectrogram, energy_threshold=0.1):
    """Locate the band of frequency bins that carries most of the energy.

    The spectrogram is averaged over axis 1 to get one energy value per row,
    the profile is min-max normalised to [0, 1], and every row whose
    normalised energy exceeds ``energy_threshold`` counts as dominant.

    Args:
        spectrogram: 2-D array with frequency bins along axis 0.
        energy_threshold: Cut-off applied to the normalised energy profile.

    Returns:
        ``(freq_min, freq_max)`` as plain ints, with ``freq_max`` exclusive so
        the pair can be used directly as a slice. Falls back to the full range
        ``(0, spectrogram.shape[0])`` when the energy profile is flat (nothing
        to normalise) or when no bin exceeds the threshold.
    """
    full_range = (0, spectrogram.shape[0])

    freq_energy = np.mean(spectrogram, axis=1)
    lowest = freq_energy.min()
    spread = freq_energy.max() - lowest
    # A flat (or NaN) profile cannot be normalised; bail out before the 0/0
    # division instead of relying on NaN comparisons evaluating to False.
    if not spread > 0:
        return full_range

    normalised = (freq_energy - lowest) / spread
    dominant_bins = np.where(normalised > energy_threshold)[0]
    if len(dominant_bins) == 0:
        return full_range
    return (int(dominant_bins.min()), int(dominant_bins.max()) + 1)

def filter_spectrogram_by_frequency_range(spectrogram, freq_min, freq_max):
    """Keep only the rows ``freq_min`` (inclusive) to ``freq_max`` (exclusive).

    All columns are kept. The bounds follow ordinary Python slice semantics.
    """
    band = slice(freq_min, freq_max)
    return spectrogram[band, :]

# Fast audio loading function that decimates or resamples audio to a target sample rate

def fast_audio_load(audio_path, target_sr=22050):
    """Load an audio file as mono samples at ``target_sr``.

    The file is read with ``soundfile``. For multi-channel audio only the
    first channel is kept. If the file's sample rate differs from
    ``target_sr`` the signal is converted:

    * by ``scipy.signal.decimate`` when the original rate is an exact integer
      multiple of ``target_sr`` and the clip is longer than 100 samples;
    * by ``librosa.resample`` in every other case (non-integer ratio,
      upsampling, very short clips, or when decimation raises ``ValueError``).

    Args:
        audio_path: Path of the audio file to read.
        target_sr: Sample rate the returned signal should have.

    Returns:
        ``(y, sr)`` where ``y`` is the 1-D sample array and ``sr`` equals
        ``target_sr``.
    """
    y, original_sr = sf.read(audio_path)
    if y.ndim > 1:
        # soundfile returns (frames, channels); keep the first channel only.
        y = y[:, 0]

    if original_sr == target_sr:
        return y, target_sr

    factor, remainder = divmod(original_sr, target_sr)
    # Integer decimation only lands on target_sr when the ratio is exact.
    # Decimating e.g. 48000 Hz by 48000 // 22050 == 2 would produce 24000 Hz
    # audio that is then mislabelled as 22050 Hz.
    if remainder == 0 and factor > 1 and len(y) > 100:
        try:
            return signal.decimate(y, factor), target_sr
        except ValueError:
            # Fall through to the general-purpose resampler below.
            pass

    return librosa.resample(y, orig_sr=original_sr, target_sr=target_sr), target_sr

# Gets the species name from a Xeno-Canto filename

def get_species_name(filename):
    """Extract the species part of an audio filename.

    A trailing ``.mp3`` or ``.wav`` extension (any letter case) is removed,
    then the remaining name is split on underscores and the first two parts
    are joined back as ``"<first>_<second>"``.

    Args:
        filename: File name such as ``"Turdus_merula_XC12345.mp3"``.

    Returns:
        The first two underscore-separated parts joined by ``_``, or the whole
        extension-less name when it has fewer than two parts.
    """
    stem, extension = os.path.splitext(filename)
    # Strip the extension only when it is a trailing audio suffix; text such
    # as ".wav" occurring in the middle of the name is left intact.
    base_name = stem if extension.lower() in ('.mp3', '.wav') else filename
    parts = base_name.split('_')
    return f"{parts[0]}_{parts[1]}" if len(parts) >= 2 else base_name