# Analysis of Bioacoustic Data

This notebook provides tools for analyzing data using a custom classifier (developed with `agile_modeling.ipynb`).

In [0]:
#@title Installation. { vertical-output: true }
#@markdown You will likely need to work with `embed_audio.ipynb` and/or
#@markdown `agile_modeling.ipynb` before working with this notebook.
#@markdown
#@markdown Run this notebook in Google Colab by following
#@markdown [this link](https://colab.research.google.com/github/google-research/perch/blob/main/analysis.ipynb).
#@markdown
#@markdown Run this cell to install the project dependencies.
# NOTE(review): the Colab link above previously pointed at
# `agile_modeling.ipynb`, which is a different notebook; confirm that
# `analysis.ipynb` is this notebook's file name in the repository.
# NOTE(review): the install below is not pinned to a tag or commit, so results
# may change as the repository's default branch moves.
%pip install git+https://github.com/google-research/perch.git


In [0]:
#@title Imports. { vertical-output: true }

import collections
from etils import epath
from ml_collections import config_dict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from chirp.inference import colab_utils
colab_utils.initialize(use_tf_gpu=True, disable_warnings=True)

from chirp.inference import baw_utils
from chirp.inference import call_density
from chirp.inference import tf_examples
from chirp.inference.search import bootstrap
from chirp.inference.search import search
from chirp.inference.search import display
from chirp.inference.classify import classify
from perch_hoplite.zoo import zoo_interface


In [0]:
#@title Basic Configuration. { vertical-output: true }

# Source of the embeddings. The loading cell uses the A2O embedding config when
# this is 'a2o'; otherwise it looks for an embeddings directory on disk.
data_source = 'filesystem'  #@param['filesystem', 'a2o'] {type:'string'}
# Auth token handed to the project state. The loading cell resets it to ''
# whenever the filesystem branch is taken, so it only matters for 'a2o'.
# Avoid saving or sharing the notebook with a real token filled in.
baw_auth_token = '' #@param

#@markdown Define the model: Usually perch or birdnet.
# NOTE(review): `model_choice` is not referenced by any later cell visible in
# this notebook — confirm whether it is still needed.
model_choice = 'perch'  #@param {type:'string'}
#@markdown Set the base directory for the project.
working_dir = '/tmp/agile'  #@param {type:'string'}

# Derive the labeled-data and custom-classifier directories from working_dir.
labeled_data_path = epath.Path(working_dir) / 'labeled'
custom_classifier_path = epath.Path(working_dir) / 'custom_classifier'

# Leave empty to auto-detect `<working_dir>/embeddings` (the loading cell
# requires a `config.json` there); set a path here to override the detection.
embeddings_path = ''


In [0]:
#@title Load Existing Project State and Models. { vertical-output: true }

# For on-disk data without an explicit override, fall back to the default
# embeddings directory, but only when an embedding config is present there.
if data_source != 'a2o' and not embeddings_path:
  if (epath.Path(working_dir) / 'embeddings/config.json').exists():
    embeddings_path = epath.Path(working_dir) / 'embeddings'

if data_source == 'a2o':
  # A2O: the embedding config comes from baw_utils and also dictates where
  # the embeddings live.
  embedding_config = baw_utils.get_a2o_embeddings_config()
  bootstrap_config = bootstrap.BootstrapConfig.load_from_embedding_config(
      embedding_config=embedding_config,
      annotated_path=labeled_data_path,
      embeddings_glob='*/embeddings-*',
  )
  embeddings_path = embedding_config.output_dir
elif embeddings_path:
  # Filesystem: read the relevant settings back from the embeddings directory.
  bootstrap_config = bootstrap.BootstrapConfig.load_from_embedding_path(
      embeddings_path=embeddings_path,
      annotated_path=labeled_data_path,
  )
  # No auth token is passed along for on-disk data.
  baw_auth_token = ''
else:
  raise ValueError('No embedding configuration found.')

project_state = bootstrap.BootstrapState(
    bootstrap_config, baw_auth_token=baw_auth_token)

# Load the custom classifier saved under the working directory.
cfg = config_dict.ConfigDict(
    dict(model_path=custom_classifier_path, logits_key='custom'))
logits_head = zoo_interface.LogitsOutputHead.from_config(cfg)
model, class_list = logits_head.logits_model, logits_head.class_list
print('Loaded custom model with classes: ')
print('\t' + '\n\t'.join(class_list.classes))

In [0]:
#@title Write classifier inference CSV. { vertical-output: true }

#@markdown This cell writes detections (locations of audio windows where
#@markdown the logit was greater than a threshold) to a CSV file.

output_filepath = epath.Path(working_dir) / 'inference.csv'  #@param

#@markdown Set the default detection thresholds, used for all classes.
#@markdown To set per-class detection thresholds, modify the code below.
#@markdown Keep in mind that thresholds are on the logit scale, so 0.0
#@markdown corresponds to a 50% model confidence.
default_threshold = 0.0  #@param
if default_threshold is None:
  # In this case, all logits are written. This can lead to very large CSV files.
  class_thresholds = None
else:
  # Any class without an explicit entry falls back to the default threshold.
  class_thresholds = collections.defaultdict(lambda: default_threshold)
  # Set per-class thresholds here. 'my_class' is a placeholder example.
  class_thresholds['my_class'] = 1.0

#@markdown Classes to ignore when counting detections.
exclude_classes = ['unknown']  #@param

#@markdown The `include_classes` list is ignored if empty.
#@markdown If non-empty, only scores for these classes will be written.
include_classes = []  #@param

# Use the same shard layout the project configuration was loaded with: the A2O
# config is loaded with a nested '*/embeddings-*' glob, so a flat
# 'embeddings-*' pattern would presumably match no shard files for that source.
embeddings_glob = '*/embeddings-*' if data_source == 'a2o' else 'embeddings-*'
embeddings_ds = tf_examples.create_embeddings_dataset(
    embeddings_path, file_glob=embeddings_glob)

# Score every embedding with the custom classifier and write the detections.
classify.write_inference_csv(
    embeddings_ds=embeddings_ds,
    model=logits_head,
    labels=class_list.classes,
    output_filepath=output_filepath,
    threshold=class_thresholds,
    embedding_hop_size_s=bootstrap_config.embedding_hop_size_s,
    include_classes=include_classes,
    exclude_classes=exclude_classes)

## Call Density Estimation

See 'All Thresholds Barred': https://arxiv.org/abs/2402.15360

In [0]:
#@title Validation and Call Density. { vertical-output: true }

target_class = 'my_class'  #@param {type:'string'}

#@markdown Bin bounds for validation. Should be an ordered list, beginning with
#@markdown 0.0 and ending with 1.0.
quantile_bounds = [0.0, 0.9, 0.99, 1.0]  #@param
#@markdown Number of validation samples per bin.
samples_per_bin = 25  #@param

quantile_bounds = np.array(quantile_bounds)
top_k = call_density.get_random_sample_size(quantile_bounds, samples_per_bin)

# Score the embeddings for the target class, requesting a random sample of
# `top_k` results; the logits are returned alongside the results.
embeddings_ds = project_state.create_embeddings_dataset(shuffle_files=True)
target_index = class_list.classes.index(target_class)
results, all_logits = search.classifer_search_embeddings_parallel(
    embeddings_classifier=logits_head,
    target_index=target_index,
    random_sample=True,
    top_k=top_k,
    hop_size_s=bootstrap_config.embedding_hop_size_s,
    embeddings_dataset=embeddings_ds,
)
combined_results = call_density.prune_random_results(
    results, all_logits, quantile_bounds, samples_per_bin)

# Plot the logit distribution and mark the logit value at each quantile bound
# with a dotted vertical line spanning the histogram's height.
bin_heights, _, _ = plt.hist(all_logits, bins=100, density=True)
value_bounds = np.quantile(all_logits, quantile_bounds)
hist_peak = np.max(bin_heights)
for bound in value_bounds:
  plt.plot([bound, bound], [0.0, hist_peak], 'k:', alpha=0.75)
plt.show()


In [0]:
#@title Display Results. { vertical-output: true }

samples_per_page = 40  #@param

# Number of pages needed for all results; a partial last page rounds up.
num_pages = np.ceil(combined_results.top_k / samples_per_page)
page_state = display.PageState(num_pages)

# Each example gets exactly one of these three labels (exclusive_labels=True).
label_choices = [target_class, f'not {target_class}', 'unsure']
display.display_paged_results(
    combined_results,
    page_state,
    samples_per_page,
    project_state=project_state,
    embedding_sample_rate=project_state.embedding_model.sample_rate,
    exclusive_labels=True,
    checkbox_labels=label_choices,
)

In [0]:
#@title Collate results and write validation log. { vertical-output: true }

# Convert the displayed results into validation examples, using the quantile
# bounds together with the logit values computed for those bounds.
validation_examples = call_density.convert_combined_results(
    target_class=target_class,
    combined_results=combined_results,
    value_bounds=value_bounds,
    quantile_bounds=quantile_bounds,
)

# Write the examples to a log for this class and report where it was written.
validation_log_path = call_density.write_validation_log(
    validation_examples, working_dir, target_class)
print('wrote log to : ', validation_log_path)


In [0]:
#@title Estimate Call Density and ROC-AUC. { vertical-output: true }

# Work from the written log so the estimate reflects what was saved.
validation_examples = call_density.load_validation_log(validation_log_path)
density_ev, density_samples = call_density.estimate_call_density(
    validation_examples)

# Plot call density estimate.
plt.figure(figsize=(10, 5))
bin_heights, _, _ = plt.hist(
    density_samples, density=True, bins=25, alpha=0.25)
# Vertical marker lines all span from zero up to the tallest histogram bar.
peak = np.max(bin_heights)
plt.plot(
    [density_ev, density_ev], [0.0, peak], 'k:', alpha=0.75,
    label='density_ev')

# The 5% and 95% quantiles of the density samples bound the estimate.
low, high = np.quantile(density_samples, [0.05, 0.95])
for bound, bound_label in ((low, 'low conf'), (high, 'high conf')):
  plt.plot([bound, bound], [0.0, peak], 'g', alpha=0.75, label=bound_label)

plt.xlim(0.0, 1.0)
plt.xlabel('Call Rate (q)')
plt.ylabel('P(q)')
plt.title(f'Call Density Estimation ({target_class})')
plt.legend()
plt.show()

print(f'EV Call Density: {density_ev:.4f}')
print(f'(Low/EV/High) Call Density Estimate: ({low:5.4f} / {density_ev:5.4f} / {high:5.4f})')

roc_auc_estimate = call_density.estimate_roc_auc(validation_examples)
print(f'Estimated ROC-AUC : {roc_auc_estimate:5.4f}')

In [0]:
#@title Display Logged Validation Examples. { vertical-output: true }

# Rebuild a search-results container from the logged validation examples,
# sized to hold every one of them.
validation_results = search.TopKSearchResults(top_k=len(validation_examples))
for v in validation_examples:
  validation_results.update(v.to_search_result(
      target_class, project_state.embedding_model.sample_rate))

samples_per_page = 40  #@param
# Size the pager from the examples shown in this cell. The count of
# `combined_results` can differ from the number of logged examples, which
# would produce the wrong number of pages.
page_state = display.PageState(
    np.ceil(len(validation_examples) / samples_per_page))

display.display_paged_results(
    validation_results,
    page_state, samples_per_page,
    project_state=project_state,
    embedding_sample_rate=project_state.embedding_model.sample_rate,
    exclusive_labels=True,
    checkbox_labels=[target_class, f'not {target_class}', 'unsure'],
)