In [1]:
import librosa
import numpy as np
import matplotlib.pyplot as plt
import cv2 as cv
import os
import csv
import soundfile as sf
from scipy import signal
from datetime import datetime

In [2]:
# Directories for template audio, clips to make detections in, and output results
# Replace __species__ with the actual file name of the template you want to use in TEMPLATE_PATH

CLIPS_PATH = "data/clips"
TEMPLATE_PATH = "data/template/__species__"
OUTPUT_DIR = "data/output"

# Customizable parameters for template matching:
# Threshold for detection confidence
# Suppression distance to avoid multiple detections in close proximity

THRESHOLD = 0.6
SUPPRESSION_DISTANCE = 50

In [3]:
os.makedirs(OUTPUT_DIR, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Output file paths for results and CSV file

# The CSV file will contain all matches found during the template matching process, showing the name of the template, 
# the name of the clip, the timestamp of the match, and the score of the match.
OUTPUT_FILE = os.path.join(OUTPUT_DIR, f"results_{timestamp}.txt")
CSV_OUTPUT_FILE = os.path.join(OUTPUT_DIR, f"all_matches_{timestamp}.csv")

In [4]:
# Computes mel spectrogram from an audio signal

def compute_mel_spectrogram(y, sr, n_mels=128, hop_length=512):
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, hop_length=hop_length)
    S_dB = librosa.power_to_db(S, ref=np.max)
    return S_dB

# Converts a mel spectrogram in dB to an image format suitable for use in template matching or visualization

def spectrogram_to_image(S_dB):
    img = (S_dB - S_dB.min()) / (S_dB.max() - S_dB.min())
    img = (img * 255).astype(np.uint8)
    return img

# Finds the dominant frequency range in a mel spectrogram based on a particular energy threshold

def find_dominant_frequency_range(spectrogram, energy_threshold=0.1):
    freq_energy = np.mean(spectrogram, axis=1)
    freq_energy = (freq_energy - freq_energy.min()) / (freq_energy.max() - freq_energy.min())
    dominant_bins = np.where(freq_energy > energy_threshold)[0]
    return (dominant_bins.min(), dominant_bins.max() + 1) if len(dominant_bins) else (0, spectrogram.shape[0])

def filter_spectrogram_by_frequency_range(spectrogram, freq_min, freq_max):
    return spectrogram[freq_min:freq_max, :]

# Fast audio loading function that decimates or resamples audio to a target sample rate

def fast_audio_load(audio_path, target_sr=22050):
    y, original_sr = sf.read(audio_path)
    y = y[:, 0] if y.ndim > 1 else y
    if original_sr != target_sr:
        decimation = original_sr // target_sr
        try:
            y = signal.decimate(y, decimation) if decimation > 1 and len(y) > 100 else librosa.resample(y, orig_sr=original_sr, target_sr=target_sr)
        except ValueError:
            y = librosa.resample(y, orig_sr=original_sr, target_sr=target_sr)
        sr = target_sr
    else:
        sr = target_sr
    return y, sr

# Gets the species name from a Xeno-Canto filename

def get_species_name(filename):
    base_name = filename.replace('.mp3', '').replace('.wav', '').replace('.MP3', '').replace('.WAV', '')
    parts = base_name.split('_')
    return f"{parts[0]}_{parts[1]}" if len(parts) >= 2 else base_name

In [5]:
# Get the species name from the template path

species = get_species_name(TEMPLATE_PATH)
species_templates = {}
if species not in species_templates:
    species_templates[species] = TEMPLATE_PATH
selected_templates = list(species_templates.values())

In [6]:
# Create a CSV file to store the results of template matching

with open(CSV_OUTPUT_FILE, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['template_name', 'clip_name', 'timestamp', 'score'])

total_matches_count = 0

In [7]:
# Goes through each selected template and compares it with all audio clips
# For each template, it computes the spectrogram and filters it by frequency range
# Then, it compares the filtered template with each audio clip's spectrogram
# If a match is found, it records the match information
# The match information in the CSV file includes the template name, clip name, timestamp, and score

for template_file in selected_templates:
    print(f"Processing template: {template_file}")
    try:
        y_template, sr_template = librosa.load(os.path.join(TEMPLATE_PATH, template_file), sr=None)
        template_spec = compute_mel_spectrogram(y_template, sr_template)
        freq_min, freq_max = find_dominant_frequency_range(template_spec)
        template_spec_filtered = filter_spectrogram_by_frequency_range(template_spec, freq_min, freq_max)
        template_img = spectrogram_to_image(template_spec_filtered)
    except Exception as e:
        print(f"Skipping template {template_file} due to error: {e}")
        continue

    for clip in os.listdir(CLIPS_PATH):
        if not clip.lower().endswith('.wav'):
            continue
        try:
            y_clip, sr_clip = fast_audio_load(os.path.join(CLIPS_PATH, clip))
            clip_spec = compute_mel_spectrogram(y_clip, sr_clip)
            clip_spec_filtered = filter_spectrogram_by_frequency_range(clip_spec, freq_min, freq_max)
            clip_img = spectrogram_to_image(clip_spec_filtered)
            res = cv.matchTemplate(clip_img, template_img, cv.TM_CCOEFF_NORMED)
            locations = np.where(res >= THRESHOLD)
            matches = sorted([(pt[1], pt[0], res[pt[1], pt[0]]) for pt in zip(*locations[::-1])], key=lambda x: x[2], reverse=True)
        except Exception as e:
            print(f"Skipping clip {clip} due to error: {e}")
            continue

        seconds_per_col = 512 / sr_clip
        with open(CSV_OUTPUT_FILE, 'a', newline='') as csvfile:
            csv_writer = csv.writer(csvfile)
            for y, x, score in matches:
                csv_writer.writerow([template_file.replace('.mp3','').replace('.wav',''), clip, x * seconds_per_col, score])
        total_matches_count += len(matches)

Processing template: data/template/__species__
Skipping template data/template/__species__ due to error: [Errno 2] No such file or directory: 'data/template/__species__\\data/template/__species__'


  y_template, sr_template = librosa.load(os.path.join(TEMPLATE_PATH, template_file), sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [8]:
# Prints out the total number of matches found and the directory of the CSV file with the information about the matches found

print(f"Total matches: {total_matches_count}")
print(f"CSV output saved to: {CSV_OUTPUT_FILE}")

Total matches: 0
CSV output saved to: data/output\all_matches_20250812_102302.csv
