In [8]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio

import numpy as np
import librosa

import csv
import io

from IPython.display import Audio

In [9]:
model_handle = "https://tfhub.dev/google/bird-vocalization-classifier/4"
model = hub.load(model_handle)

In [10]:
# Load the labels that the model was trained on.
# The labels file is in the assets forlder under label.csv. Each line is an ebird id.

# Find the name of the class with the top score when mean-aggregated across frames.
def class_names_from_csv(class_map_csv_text):
  """Returns list of class names corresponding to score vector."""
  with open(labels_path) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    class_names = [name for name, in csv_reader]
    return class_names[1:]

labels_path = hub.resolve(model_handle) + "/assets/label.csv"
classes = class_names_from_csv(labels_path)

The ```frame_audio``` function is based on the [Chirp lib](https://github.com/google-research/chirp/blob/10c5faa325a3c3468fa6f18a736fc1aeb9bf8129/chirp/inference/interface.py#L128) version, using tf.signal instead of librosa.

The `ensure_sample_rate` is a function to ensure that any audio used with the model has the expected sample rate of 32kHz.

In [11]:
def frame_audio(
      audio_array: np.ndarray,
      window_size_s: float = 5.0,
      hop_size_s: float = 5.0,
      sample_rate = 32000,
  ) -> np.ndarray:
    """Helper function for framing audio for inference."""
    if window_size_s is None or window_size_s < 0:
      return audio_array[np.newaxis, :]
    frame_length = int(window_size_s * sample_rate)
    hop_length = int(hop_size_s * sample_rate)
    framed_audio = tf.signal.frame(audio_array, frame_length, hop_length, pad_end=True)
    return framed_audio

def ensure_sample_rate(waveform, original_sample_rate,
                       desired_sample_rate=32000):
  """Resample waveform if required."""
  if original_sample_rate != desired_sample_rate:
    waveform = tfio.audio.resample(waveform, original_sample_rate, desired_sample_rate)
  return desired_sample_rate, waveform

In [12]:
!curl -O  "https://upload.wikimedia.org/wikipedia/commons/1/17/Branta_canadensis_-_Canada_Goose_-_XC62259.ogg"
# 
# !curl -O  "https://xeno-canto.org/863673/download"
#
# !curl -O  "https://xeno-canto.org/863614/download"
#
# !curl -O  "https://xeno-canto.org/859024/download"
#
# !curl -O  "https://xeno-canto.org/858730/download"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  519k  100  519k    0     0  1362k      0 --:--:-- --:--:-- --:--:-- 1360k


In [13]:
canada_goose = "Branta_canadensis_-_Canada_Goose_-_XC62259.ogg"
# canada_goose = "/kaggle/working/download"

audio, sample_rate = librosa.load(canada_goose)

sample_rate, wav_data_goose = ensure_sample_rate(audio, sample_rate)
Audio(wav_data_goose, rate=sample_rate)

In [14]:
fixed_tm = frame_audio(wav_data_goose)
fixed_tm.shape

TensorShape([8, 160000])

In [15]:
logits, embeddings = model.infer_tf(fixed_tm[:1])

In [16]:
probabilities = tf.nn.softmax(logits)
argmax = np.argmax(probabilities)
print(f"The audio is from the class {classes[argmax]} (element:{argmax} in the label.csv file), with probability of {probabilities[0][argmax]}")

The audio is from the class cangoo (element:2021 in the label.csv file), with probability of 0.9999328851699829


In [17]:
all_logits, all_embeddings = model.infer_tf(fixed_tm[:1])
for window in fixed_tm[1:]:
  logits, embeddings = model.infer_tf(window[np.newaxis, :])
  all_logits = np.concatenate([all_logits, logits], axis=0)

all_logits.shape

(8, 10932)

In [18]:
frame = 0
for frame_logits in all_logits:
  probabilities = tf.nn.softmax(frame_logits)
  argmax = np.argmax(probabilities)
  print(f"For frame {frame}, the audio is from the class {classes[argmax]} (element:{argmax} in the label.csv file), with probability of {probabilities[argmax]}")
  frame += 1

For frame 0, the audio is from the class cangoo (element:2021 in the label.csv file), with probability of 0.9999328851699829
For frame 1, the audio is from the class cangoo (element:2021 in the label.csv file), with probability of 0.9999147653579712
For frame 2, the audio is from the class cangoo (element:2021 in the label.csv file), with probability of 0.9999765157699585
For frame 3, the audio is from the class cangoo (element:2021 in the label.csv file), with probability of 0.9999567270278931
For frame 4, the audio is from the class cangoo (element:2021 in the label.csv file), with probability of 0.9999792575836182
For frame 5, the audio is from the class cangoo (element:2021 in the label.csv file), with probability of 0.9999881982803345
For frame 6, the audio is from the class cangoo (element:2021 in the label.csv file), with probability of 0.999988317489624
For frame 7, the audio is from the class cangoo (element:2021 in the label.csv file), with probability of 0.9999852180480957


In [19]:
# Works with one audio file
# Dataset used 