<a href="https://colab.research.google.com/github/duane-edgington/google-multispecies-whale-detection/blob/main/sigmoid_duane_audio_chunks_test_google_multispecies_whale.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path

import tensorflow as tf
import tensorflow_hub as hub
import os
import glob

In [None]:
import time
from datetime import timedelta

In [None]:
# Kaggle Models URL of the Google multispecies whale model; passed to hub.load() below.
MODEL = 'https://www.kaggle.com/models/google/multispecies-whale/TensorFlow2/default/2'
# Sample recording in a Google Cloud Storage bucket.
# NOTE(review): WAV_PATH is not referenced by any cell visible in this notebook.
WAV_PATH = 'gs://bioacoustics-www1/sounds/Cross_24kHz.wav'

In [None]:
# Connect Google Drive
from google.colab import drive
drive.mount("/content/drive", force_remount=True)  # This will prompt for authorization.

# Create the input audio file directories on Drive if they don't exist yet.
folders = ["audio_chunks"]

for folder in folders:
  path = "/content/drive/MyDrive/" + folder
  # A single mkdir(exist_ok=True) replaces the exists()-then-mkdir() pair, so
  # there is no window between the check and the creation. Parents are NOT
  # created on purpose: if MyDrive is missing (mount failed) this still raises.
  Path(path).mkdir(exist_ok=True)

Mounted at /content/drive


In [None]:
# Search the home directory tree for any saved_model.pb files.
!find ~/ -name 'saved_model.pb'

In [None]:
# Record the start time (seconds since the epoch); the timing cell near the
# end of the notebook subtracts it from end_time to report the elapsed time.
start_time = time.time()


## Upload any audio files into the "audio_chunks" folder in your Google Drive

In [None]:
# Load the model identified by MODEL through TensorFlow Hub. The resulting
# `model` object is used as a global by run_model() further down.
model = hub.load(MODEL)

# Debugging aids: uncomment to inspect the loaded model.
#print(model.signatures.keys())
#print(model.variables)

# Model metadata; only its 'class_names' entry is read in the visible cells.
metadata = model.metadata()

#print(metadata.keys())

# Class names converted to a NumPy array; the entries are decoded from bytes
# to UTF-8 text when results are reported.
byte_class_names = metadata['class_names'].numpy()


In [None]:
# This block would load and prepare TensorBoard (currently disabled).
# It is kept as comments rather than a string literal so that running the cell
# does not echo the string back as cell output. Uncomment both lines to enable:
#
# %load_ext tensorboard
# %tensorboard --logdir logs

'\n## This block would load and prepare TensorBoard\n\n%load_ext tensorboard\n%tensorboard --logdir logs\n'

In [None]:
def run_model(wav_file: str, f, top_k: int = 10):
    """Score one WAV file with the whale model and record its top classes.

    The file is decoded to a single-channel float waveform, passed through
    ``model.front_end`` to obtain a spectrogram, framed into context windows
    (length 128, step 64 along the second-to-last axis) and scored with
    ``model.logits``. Sigmoid probabilities are ranked and the ``top_k``
    highest-scoring classes are printed and written to ``f`` as one line.

    Only the FIRST context window is ranked and reported; any further windows
    produced for longer recordings are printed with the raw arrays but do not
    contribute to the result line.

    Relies on the notebook globals ``model`` and ``byte_class_names``.

    Args:
        wav_file: Path of the WAV file to classify.
        f: Open, writable text file object that receives the result line.
        top_k: Number of classes to report. Defaults to 10 (the previously
            hard-coded value) and is clamped to the number of model classes.
    """
    # tf.audio.decode_wav decodes a 16-bit PCM WAV file to a float tensor,
    # scaling the -32768..32767 sample values to -1.0..1.0.
    # desired_channels=1 decodes one channel; desired_samples=-1 decodes all
    # samples. The sample rate it also returns is not needed here.
    waveform, _ = tf.audio.decode_wav(
        tf.io.read_file(wav_file), desired_channels=1, desired_samples=-1)

    # Insert an outer "batch" dimension of length 1 in front of the waveform.
    batch = tf.expand_dims(waveform, 0)

    # The model's front end turns the batched waveform into a spectrogram.
    spectrogram = model.front_end(batch)
    print(spectrogram.shape)

    context_windows = tf.signal.frame(
        tf.squeeze(spectrogram, 0),
        frame_length=128,
        frame_step=64,
        axis=-2,
    )

    logits = model.logits(context_windows)

    raw_predictions = logits.numpy()
    print(raw_predictions)
    print(raw_predictions.shape)

    # Compute the sigmoid once and reuse it for both ranking and reporting.
    all_probabilities = tf.nn.sigmoid(logits).numpy()
    print(all_probabilities)

    # Rank the classes of the first context window, highest probability first.
    num_classes = all_probabilities.shape[-1]
    k = max(0, min(top_k, num_classes))
    top_logits_classes = tf.argsort(
        all_probabilities, axis=-1, direction='DESCENDING')[0, :k].numpy()

    # Plain Python floats keep the written/printed lists free of NumPy reprs.
    save_probabilities = [float(p) for p in all_probabilities[0][top_logits_classes]]
    class_names = [name.decode('utf-8') for name in byte_class_names[top_logits_classes]]

    # NOTE(review): the label text and quoting below (including the
    # "probablitites" spelling) are kept byte-for-byte so that existing result
    # files and anything parsing them stay compatible.
    f.write(f'{wav_file}, {class_names},"sigmoid probablitites: ,"{save_probabilities}\n')
    print(f"Top {len(class_names)} classes: {class_names}, sigmoid probabilities: {save_probabilities}\n")

# Disabled alternative for the end of run_model(), kept for reference only.
# It is written as comments rather than a string literal so that running the
# cell does not echo the string back as cell output.
#
#     # Then, do softmax and Find the top 5 classes in the probabilities
#     top_classes = tf.argsort(probabilities, axis=-1, direction='DESCENDING')[0, :5]
#     p1 = tf.nn.softmax(logits, axis=-1)[0, top_classes[0]].numpy()
#     p2 = tf.nn.softmax(logits, axis=-1)[0, top_classes[1]].numpy()
#     p3 = tf.nn.softmax(logits, axis=-1)[0, top_classes[2]].numpy()
#     p4 = tf.nn.softmax(logits, axis=-1)[0, top_classes[3]].numpy()
#     p5 = tf.nn.softmax(logits, axis=-1)[0, top_classes[4]].numpy()
#     probabilities = [float(p1), float(p2), float(p3), float(p4), float(p5)]
#
#     # Print and save the results
#     class_names = [name.decode('utf-8') for name in byte_class_names[top_classes]]
#     f.write(f'{wav_file}, {class_names}, {probabilities}\n')
#     print(f"Top 5 classes: {class_names}, softmax probabilities: {probabilities}\n")

'\n    # Then, do softmax and Find the top 5 classes in the probabilities\n    top_classes = tf.argsort(probabilities, axis=-1, direction=\'DESCENDING\')[0, :5]\n    p1 = tf.nn.softmax(logits, axis=-1)[0, top_classes[0]].numpy()\n    p2 = tf.nn.softmax(logits, axis=-1)[0, top_classes[1]].numpy()\n    p3 = tf.nn.softmax(logits, axis=-1)[0, top_classes[2]].numpy()\n    p4 = tf.nn.softmax(logits, axis=-1)[0, top_classes[3]].numpy()\n    p5 = tf.nn.softmax(logits, axis=-1)[0, top_classes[4]].numpy()\n    probabilities = [float(p1), float(p2), float(p3), float(p4), float(p5)]\n\n    # Print and save the results\n    class_names = [name.decode(\'utf-8\') for name in byte_class_names[top_classes]]\n    f.write(f\'{wav_file}, {class_names}, {probabilities}\n\')\n    print(f"Top 5 classes: {class_names}, softmax probabilities: {probabilities}\n")\n    '

In [None]:
# Dataset directories on the mounted Google Drive.
blue_path = Path('/content/drive/MyDrive/blueA/')
sound_path = Path('/content/drive/MyDrive/audio_chunks/')
#sound_path = Path('/content/drive/MyDrive/July_15_2025_audio_chunks')
urban_path = Path('/content/drive/MyDrive/UrbanSound8K/fold1/')

# Echo each directory, in the order they are declared above.
for dataset_dir in (blue_path, sound_path, urban_path):
    print(dataset_dir)

/content/drive/MyDrive/blueA
/content/drive/MyDrive/audio_chunks
/content/drive/MyDrive/UrbanSound8K/fold1


In [None]:
# List the contents of the audio_chunks folder on Drive (the model's input files).
!ls /content/drive/MyDrive/audio_chunks/

MARS_20180401_060914_resampled_24kHz_chunk_001.wav
MARS_20180401_060914_resampled_24kHz_chunk_002.wav
MARS_20180401_060914_resampled_24kHz_chunk_003.wav
MARS_20180401_060914_resampled_24kHz_chunk_004.wav
MARS_20180401_060914_resampled_24kHz_chunk_005.wav
MARS_20180401_060914_resampled_24kHz_chunk_006.wav
MARS_20180401_060914_resampled_24kHz_chunk_007.wav
MARS_20180401_060914_resampled_24kHz_chunk_008.wav
MARS_20180401_060914_resampled_24kHz_chunk_009.wav
MARS_20180401_060914_resampled_24kHz_chunk_010.wav
MARS_20180401_060914_resampled_24kHz_chunk_011.wav
MARS_20180401_060914_resampled_24kHz_chunk_012.wav
MARS_20180401_060914_resampled_24kHz_chunk_013.wav
MARS_20180401_060914_resampled_24kHz_chunk_014.wav
MARS_20180401_060914_resampled_24kHz_chunk_015.wav
MARS_20180401_060914_resampled_24kHz_chunk_016.wav
MARS_20180401_060914_resampled_24kHz_chunk_017.wav
MARS_20180401_060914_resampled_24kHz_chunk_018.wav
MARS_20180401_060914_resampled_24kHz_chunk_019.wav
MARS_20180401_060914_resampled_

In [None]:
# Grab all the bat calls and run the model
#with open('google_species_mbari_blueA.out.txt', 'w') as f:
#    for wav_file in blue_path.glob('bat*16kHz*.wav'):
#        print(wav_file.as_posix())
#        run_model(wav_file.as_posix(), f)


In [None]:
# Grab all the audio chunks from sound recording and run the model
# Run the model on every WAV chunk in sound_path and collect one result line
# per file in google_species_mbari_sound_5sec.txt.
# Path.glob() yields files in arbitrary order, so the matches are sorted to
# make the results file reproducible and ordered by chunk name.
with open('google_species_mbari_sound_5sec.txt', 'w') as f:
    for wav_file in sorted(sound_path.glob('*.wav')):
        print(wav_file.as_posix())
        run_model(wav_file.as_posix(), f)

/content/drive/MyDrive/audio_chunks/MARS_20180401_060914_resampled_24kHz_chunk_014.wav
(1, 161, 128)
[[ -6.044093   -7.8251767 -10.993959   -9.966471   -8.401753   -8.672831
   -8.571198  -10.983252   -8.028532   -9.87575    -9.172934  -13.28414  ]]
(1, 12)
[[2.3662187e-03 3.9938840e-04 1.6802613e-05 4.6945752e-05 2.2442300e-04
  1.7114455e-04 1.8944948e-04 1.6983497e-05 3.2592021e-04 5.1403669e-05
  1.0380075e-04 1.7012602e-06]]
Top 10 classes: ['Oo', 'Mn', 'Echolocation', 'Upcall', 'Call', 'Bp', 'Whistle', 'Bm', 'Be', 'Gunshot'], sigmoid probabilities: [0.002366218715906143, 0.00039938840200193226, 0.00032592020579613745, 0.00022442299814429134, 0.00018944947805721313, 0.00017114455113187432, 0.00010380074672866613, 5.140366920386441e-05, 4.6945751819293946e-05, 1.6983496607281268e-05]

/content/drive/MyDrive/audio_chunks/MARS_20180401_060914_resampled_24kHz_chunk_080.wav
(1, 161, 128)
[[ -3.9855518  -6.970813  -13.844017  -10.490062  -10.206969   -8.266721
   -7.782776  -11.443747  

In [None]:
# Confirm the results file exists, then print its contents.
!ls google_species_mbari_sound_5sec.txt
!cat google_species_mbari_sound_5sec.txt

google_species_mbari_sound_5sec.txt
/content/drive/MyDrive/audio_chunks/MARS_20180401_060914_resampled_24kHz_chunk_014.wav, ['Oo', 'Mn', 'Echolocation', 'Upcall', 'Call', 'Bp', 'Whistle', 'Bm', 'Be', 'Gunshot'],"sigmoid probablitites: ,"[0.002366218715906143, 0.00039938840200193226, 0.00032592020579613745, 0.00022442299814429134, 0.00018944947805721313, 0.00017114455113187432, 0.00010380074672866613, 5.140366920386441e-05, 4.6945751819293946e-05, 1.6983496607281268e-05]
/content/drive/MyDrive/audio_chunks/MARS_20180401_060914_resampled_24kHz_chunk_080.wav, ['Oo', 'Mn', 'Echolocation', 'Call', 'Bp', 'Whistle', 'Bm', 'Upcall', 'Be', 'Gunshot'],"sigmoid probablitites: ,"[0.01824318617582321, 0.0009380088304169476, 0.0005961452843621373, 0.0004166797734797001, 0.0002568604832049459, 0.0002199877635575831, 8.179630822269246e-05, 3.691080564749427e-05, 2.781070907076355e-05, 1.0716162250901107e-05]
/content/drive/MyDrive/audio_chunks/MARS_20180401_060914_resampled_24kHz_chunk_027.wav, ['Mn',

In [None]:
# Copy the results file into the audio_chunks folder on Drive.
!cp google_species_mbari_sound_5sec.txt /content/drive/MyDrive/audio_chunks/

In [None]:
# List the audio_chunks folder on Drive; the results file should now appear in it.
!ls /content/drive/MyDrive/audio_chunks/

google_species_mbari_sound_5sec.txt
MARS_20180401_060914_resampled_24kHz_chunk_001.wav
MARS_20180401_060914_resampled_24kHz_chunk_002.wav
MARS_20180401_060914_resampled_24kHz_chunk_003.wav
MARS_20180401_060914_resampled_24kHz_chunk_004.wav
MARS_20180401_060914_resampled_24kHz_chunk_005.wav
MARS_20180401_060914_resampled_24kHz_chunk_006.wav
MARS_20180401_060914_resampled_24kHz_chunk_007.wav
MARS_20180401_060914_resampled_24kHz_chunk_008.wav
MARS_20180401_060914_resampled_24kHz_chunk_009.wav
MARS_20180401_060914_resampled_24kHz_chunk_010.wav
MARS_20180401_060914_resampled_24kHz_chunk_011.wav
MARS_20180401_060914_resampled_24kHz_chunk_012.wav
MARS_20180401_060914_resampled_24kHz_chunk_013.wav
MARS_20180401_060914_resampled_24kHz_chunk_014.wav
MARS_20180401_060914_resampled_24kHz_chunk_015.wav
MARS_20180401_060914_resampled_24kHz_chunk_016.wav
MARS_20180401_060914_resampled_24kHz_chunk_017.wav
MARS_20180401_060914_resampled_24kHz_chunk_018.wav
MARS_20180401_060914_resampled_24kHz_chunk_019

In [None]:
# Capture the finish time and report how long the run took since start_time.
end_time = time.time()

# Elapsed wall-clock seconds, plus a human-readable rendering via timedelta.
time_difference = end_time - start_time
time_difference_formatted = str(timedelta(seconds=time_difference))

# Print both timestamps in local time, then the elapsed duration.
stamp_format = '%Y-%m-%d %H:%M:%S'
for label, moment in (("Start Time", start_time), ("End Time", end_time)):
    print(f"{label}: {time.strftime(stamp_format, time.localtime(moment))}")
print(f"Time Difference: {time_difference_formatted}")

Start Time: 2025-08-19 23:20:19
End Time: 2025-08-19 23:20:49
Time Difference: 0:00:29.784324


In [None]:
# Stop here: this cell deliberately raises SystemExit with the message below.
# NOTE(review): presumably meant to keep a "Run all" from continuing into the
# experimental cells that follow - confirm.
import sys
sys.exit("Stop execution")

SystemExit: Stop execution

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# Grab all the audio chunks from the orca recordings and run the model.
# Unlike the cell above, this one catches input-format errors: when run_model()
# raises tf.errors.InvalidArgumentError for a file, the audio is re-read with
# librosa and scored inline. This was important for running the UrbanSound8k
# dataset through the model.
#
# NOTE(review): `orca_path` is not defined in any cell visible in this
# notebook; set it to the directory (a pathlib.Path) holding the orca WAV
# files before running this cell.
# NOTE(review): the fallback reports the top 3 classes with SOFTMAX scores,
# whereas run_model() reports sigmoid probabilities, so one output file can
# contain both kinds of line - confirm this is intended.
from scipy.io.wavfile import read, write
import librosa
with open('google_species_orca.1hourout_librosa.txt', 'w') as f:
    # Sorted so the results file is written in a reproducible order.
    for wav_file in sorted(orca_path.glob('*.wav')):
        print(wav_file.as_posix())
        try:
            run_model(wav_file.as_posix(), f)
        except tf.errors.InvalidArgumentError as e:
            print(f"Error processing {wav_file}: {e}")
            # If tf.audio.decode_wav fails, try using librosa
            try:
                # sr=None keeps the file's native sampling rate.
                # NOTE(review): no resampling is done here - confirm the rate
                # matches what the model expects.
                y, sr = librosa.load(wav_file.as_posix(), sr=None)

                # librosa returns a 1-D mono signal; add a trailing channel
                # axis so the batch has the same rank as the one run_model()
                # builds from tf.audio.decode_wav (batch, samples, 1).
                waveform = tf.convert_to_tensor(y, dtype=tf.float32)[:, tf.newaxis]
                batch = tf.expand_dims(waveform, 0)

                spectrogram = model.front_end(batch)
                context_windows = tf.signal.frame(
                    tf.squeeze(spectrogram, 0),
                    frame_length=128,
                    frame_step=64,
                    axis=-2,
                )

                logits = model.logits(context_windows)

                # Rank the classes of the first context window by sigmoid
                # probability and keep the top 3.
                sigmoid_scores = tf.nn.sigmoid(logits)
                top_classes = tf.argsort(
                    sigmoid_scores, axis=-1, direction='DESCENDING')[0, :3].numpy()

                # Softmax is computed once; plain floats keep NumPy scalar
                # reprs out of the results file.
                softmax_scores = tf.nn.softmax(logits, axis=-1).numpy()[0]
                probabilities = [float(p) for p in softmax_scores[top_classes]]

                # Print and save the results
                class_names = [name.decode('utf-8') for name in byte_class_names[top_classes]]
                f.write(f'{wav_file}, {class_names}, {probabilities}\n')
                print(f'Top 3 classes: {class_names}, probabilities: {probabilities}\n')

            except Exception as e:
                # Best effort: report the failure and move on to the next file.
                print(f"Error processing {wav_file} with librosa: {e}")

In [None]:
# Confirm the orca results file exists, then print its contents.
!ls google_species_orca.1hourout_librosa.txt
!cat google_species_orca.1hourout_librosa.txt