In [1]:
# Reload imported modules automatically before executing each cell, so edits
# to the local `birdclef` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Load the lab_black extension (presumably auto-formats code cells with black).
%load_ext lab_black

## Background noise

In [6]:
from pathlib import Path
from birdclef import birdnet
from birdclef.utils import get_spark
from pyspark.sql import functions as F
import pickle
import tqdm
import time
import tensorflow as tf
from functools import partial
import librosa
from birdclef.data.utils import slice_seconds
import numpy as np

# --- BirdNET embedding model -------------------------------------------------
# Load the BirdNET model from the local repo checkout and derive the function
# that turns audio slices into embeddings.
repo_path = Path("../data/models/birdnet-analyzer-pruned")
birdnet_model = birdnet.load_model_from_repo(repo_path)
embedding_func = birdnet.embedding_func(birdnet_model)

# --- Binary logistic classifier and its label encoder ------------------------
model_path = Path("../data/models/baseline/logistic_binary.pkl")
le_path = Path("../data/models/baseline/logistic_binary_label_encoder.pkl")

# NOTE(review): unpickling executes arbitrary code; only load trusted files.
clf, le = (
    pickle.loads(pickle_path.read_bytes()) for pickle_path in (model_path, le_path)
)

# Replace the classifier's class labels with the label encoder's classes so the
# inference code can pair each probability column with its label name.
clf.classes_ = le.classes_

# --- Spark session -----------------------------------------------------------
spark = get_spark()
spark

In [20]:
def average_windows(prob):
    """Pool each pair of consecutive probability rows into a single row.

    Despite the name, the pooling is an element-wise **maximum**, not a mean
    (max was judged likely to work better than the mean). Rows are grouped in
    order: (0, 1), (2, 3), ...

    The original note assumes the rows come from a sliding window of
    probabilities: 1s sliding window of 3s context.

    Parameters
    ----------
    prob : array-like
        Probabilities whose first axis indexes windows, typically shaped
        ``(n_windows, n_classes)``. The number of windows must be even.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_windows // 2, n_values)``, where ``n_values`` is
        the number of values in one window. An empty input yields an empty
        result instead of raising ``IndexError``.

    Raises
    ------
    ValueError
        If the windows cannot be split into pairs (e.g. an odd row count).
    """
    prob = np.asarray(prob)
    # Derive the per-window width from the shape rather than from `prob[0]`,
    # which would fail when there are no rows at all.
    n_values = int(np.prod(prob.shape[1:]))
    return prob.reshape(-1, 2, n_values).max(axis=1)


def run_inference(path, embedding_func, clf, sr=48000):
    """Score one audio file in consecutive 3-second slices.

    The file is loaded as mono audio at ``sr``, cut with
    ``slice_seconds(..., seconds=3, step=3)``, embedded, and passed to the
    classifier's ``predict_proba``.

    Parameters
    ----------
    path : pathlib.Path
        Audio file to score; its stem becomes the prefix of every ``row_id``.
    embedding_func : callable
        Called with the slices; element ``[0]`` of its return value is used as
        the classifier input.
    clf : object
        Classifier exposing ``predict_proba`` and ``classes_``.
    sr : int, optional
        Sampling rate handed to ``librosa.load``.

    Returns
    -------
    list of dict
        One dict per slice, holding ``row_id`` (``"<stem>_<offset>"`` where the
        offset is the slice index times 3) and one entry per class label with
        the probability rounded to 6 decimals.
    """
    # NOTE: an earlier experiment (padding the slices, keeping a subset of each
    # 5-slice frame and pooling with `average_windows`) is disabled; every
    # slice is scored as-is.
    audio, sample_rate = librosa.load(path.as_posix(), sr=sr, mono=True)
    slices = slice_seconds(audio, sample_rate, seconds=3, step=3)

    embeddings = embedding_func(slices)[0]
    scores = clf.predict_proba(embeddings)

    return [
        dict(
            row_id=f"{path.stem}_{index * 3}",
            **dict(zip(clf.classes_, np.around(slice_scores, 6).tolist())),
        )
        for index, slice_scores in enumerate(scores)
    ]


# Score every background-audio WAV file and record how long each file takes.
test_path = Path("../data/raw/background_audio/")
rows, timings = [], []
for path in tqdm.tqdm(test_path.glob("*.wav")):
    start = time.time()
    rows.extend(run_inference(path, embedding_func, clf))
    timings.append(time.time() - start)

# Extrapolate the mean per-file time to 200 files, reported in minutes.
# NOTE(review): 200 is presumably the expected number of files in the target
# run -- confirm and consider naming this constant.
avg_time_sec = np.mean(timings)
est_time_min = avg_time_sec * 200 / 60
summary = f"took {round(avg_time_sec,2)} seconds per loop, estimated {round(est_time_min,2)} minutes"
print(summary)

2034it [06:02,  5.61it/s]

took 0.18 seconds per loop, estimated 0.59 minutes





In [21]:
import pandas as pd

# Gather the per-slice prediction dicts into one table: a `row_id` column plus
# one probability column per class label.
rows_df = pd.DataFrame(rows)
display(rows_df)

Unnamed: 0,row_id,call,no_call
0,0c983624-94c9-488f-a30a-ff4607d91844_0,0.538661,0.461339
1,0c983624-94c9-488f-a30a-ff4607d91844_3,0.388857,0.611143
2,0c983624-94c9-488f-a30a-ff4607d91844_6,0.584300,0.415700
3,0c983624-94c9-488f-a30a-ff4607d91844_9,0.651146,0.348854
4,324c7e9e-5962-49e2-ae32-c695a024ac26_0,0.107948,0.892052
...,...,...,...
8131,2aad176a-4d4e-44d2-a878-2196db6286ea_9,0.055792,0.944208
8132,2a8691c8-c0eb-43c0-a278-b89fd208ac89_0,0.111515,0.888485
8133,2a8691c8-c0eb-43c0-a278-b89fd208ac89_3,0.162488,0.837512
8134,2a8691c8-c0eb-43c0-a278-b89fd208ac89_6,0.128994,0.871006
