# Transference Learning Cats From ESC-50 Dataset

This is a modification of [this public notebook](https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/audio/transfer_learning_audio.ipynb) to accomplish binary classification instead of having multiple classes.

In [3]:
%pip install -q tensorflow==2.13.1 tensorflow-hub==0.16.1 tensorflow-io==0.34.0
%pip install -q matplotlib==3.8.3 numpy==1.24.3 pandas==2.2.1

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os

from IPython import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as tfhub
import tensorflow_io as tfio

2024-03-07 20:38:27.296356: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
@tf.function
def load_wav_16k_mono(filename):
    """
    Load a WAV file, convert it to a float tensor, resample to 16 kHz single-channel audio.
    """
    file_contents = tf.io.read_file(filename)
    wav, sample_rate = tf.audio.decode_wav(
        file_contents,
        desired_channels=1
    )
    wav = tf.squeeze(wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
    return wav

In [6]:
yamnet_model_handle = "https://tfhub.dev/google/yamnet/1"
yamnet_model = tfhub.load(yamnet_model_handle)

class_map_path = yamnet_model.class_map_path().numpy().decode("utf-8")
class_names = list(pd.read_csv(class_map_path)["display_name"])

class_names

['Speech',
 'Child speech, kid speaking',
 'Conversation',
 'Narration, monologue',
 'Babbling',
 'Speech synthesizer',
 'Shout',
 'Bellow',
 'Whoop',
 'Yell',
 'Children shouting',
 'Screaming',
 'Whispering',
 'Laughter',
 'Baby laughter',
 'Giggle',
 'Snicker',
 'Belly laugh',
 'Chuckle, chortle',
 'Crying, sobbing',
 'Baby cry, infant cry',
 'Whimper',
 'Wail, moan',
 'Sigh',
 'Singing',
 'Choir',
 'Yodeling',
 'Chant',
 'Mantra',
 'Child singing',
 'Synthetic singing',
 'Rapping',
 'Humming',
 'Groan',
 'Grunt',
 'Whistling',
 'Breathing',
 'Wheeze',
 'Snoring',
 'Gasp',
 'Pant',
 'Snort',
 'Cough',
 'Throat clearing',
 'Sneeze',
 'Sniff',
 'Run',
 'Shuffle',
 'Walk, footsteps',
 'Chewing, mastication',
 'Biting',
 'Gargling',
 'Stomach rumble',
 'Burping, eructation',
 'Hiccup',
 'Fart',
 'Hands',
 'Finger snapping',
 'Clapping',
 'Heart sounds, heartbeat',
 'Heart murmur',
 'Cheering',
 'Applause',
 'Chatter',
 'Crowd',
 'Hubbub, speech noise, speech babble',
 'Children playing'

In [7]:
# # For some reason this would not work, so I downloaded the file locally.
# _ = tf.keras.utils.get_file(
#   'esc-50.zip',
#   'https://github.com/karoldvl/ESC-50/archive/master.zip',
#   cache_dir='./',
#   cache_subdir='datasets',
#   extract=True
# )
esc50_csv = '/Users/ramon/Downloads/ESC-50-master/meta/esc50.csv'
base_data_path = '/Users/ramon/Downloads/ESC-50-master/audio/'

pd_data = pd.read_csv(esc50_csv)

cat_label = pd_data['category'].apply(lambda name: 1 if name == "cat" else 0)
pd_data = pd_data.assign(cat_label=cat_label)

full_path = pd_data['filename'].apply(lambda row: os.path.join(base_data_path, row))
pd_data = pd_data.assign(filename=full_path)

filenames = pd_data['filename']
cat_labels = pd_data['cat_label']
folds = pd_data['fold']

main_ds = tf.data.Dataset.from_tensor_slices((filenames, cat_labels, folds))
main_ds.element_spec

def load_wav_for_map(filename, label, fold):
  return load_wav_16k_mono(filename), label, fold

main_ds = main_ds.map(load_wav_for_map)
print(f"Element spec: {main_ds.element_spec}")

def extract_embedding(wav_data, label, fold):
  scores, embeddings, spectogram = yamnet_model(wav_data)
  num_embeddings = tf.shape(embeddings)[0]
  return (
    embeddings,
    tf.repeat(label, num_embeddings),
    tf.repeat(fold, num_embeddings)
  )

main_ds = main_ds.map(extract_embedding).unbatch()
print(f"Element spec: {main_ds.element_spec}")

cached_ds = main_ds.cache()

2024-03-07 20:38:34.801277: I tensorflow_io/core/kernels/cpu_check.cc:128] Your CPU supports instructions that this TensorFlow IO binary was not compiled to use: AVX AVX2 FMA






Element spec: (TensorSpec(shape=<unknown>, dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))
Element spec: (TensorSpec(shape=(1024,), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))


In [8]:
train_ds = cached_ds.filter(lambda embedding, label, fold: fold < 4)
val_ds = cached_ds.filter(lambda embedding, label, fold: fold == 4)
test_ds = cached_ds.filter(lambda embedding, label, fold: fold == 5)

remove_fold_column = lambda embedding, label, fold: (embedding, label)

train_ds = train_ds.map(remove_fold_column)
val_ds = val_ds.map(remove_fold_column)
test_ds = test_ds.map(remove_fold_column)

cats_ds = train_ds.filter(lambda waveform, label: label == True)
train_ds = train_ds.cache().shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)

In [9]:
train_ds.element_spec

(TensorSpec(shape=(None, 1024), dtype=tf.float32, name=None),
 TensorSpec(shape=(None,), dtype=tf.int64, name=None))

In [12]:
my_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(1024), dtype=tf.float32, name='input_embedding'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(1)
], name='my_model')

my_model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy']
)

callback = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=3,
    restore_best_weights=True
)

history = my_model.fit(
    train_ds,
    epochs=20,
    validation_data=val_ds,
    callbacks=callback
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20


In [13]:
loss, accuracy = my_model.evaluate(test_ds)
print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")

Loss: 0.08446495980024338
Accuracy: 0.9807500243186951


In [14]:
iter_cats = pd_data[pd_data.category == "cat"].iterrows()
cat_waveform = load_wav_16k_mono(next(iter_cats)[1]['filename'])
display.Audio(cat_waveform, rate=16000)





In [15]:
iter_non_cats = pd_data[pd_data.category != "cat"].iterrows()
non_cat_waveform = load_wav_16k_mono(next(iter_non_cats)[1]['filename'])
display.Audio(non_cat_waveform, rate=16000)





In [16]:
scores, embeddings, spectrogram = yamnet_model(cat_waveform)
result = my_model(embeddings).numpy()

print(f"Result shape {result.shape}")
print(result)

cat_probability = result.mean(axis=0)[0]
print(f"Cat probability: {cat_probability}")

print(f"Is cat? {cat_probability > 0}")

Result shape (10, 1)
[[21.02313  ]
 [ 7.0256567]
 [ 3.6303647]
 [ 7.44312  ]
 [21.550743 ]
 [18.751553 ]
 [28.187178 ]
 [18.079542 ]
 [22.401886 ]
 [ 4.666168 ]]
Cat probability: 15.275934219360352
Is cat? True


In [17]:
scores, embeddings, spectrogram = yamnet_model(non_cat_waveform)
result = my_model(embeddings).numpy()

print(f"Result shape {result.shape}")
print(result)

cat_probability = result.mean(axis=0)[0]
print(f"Cat probability: {cat_probability}")

print(f"Is cat? {cat_probability > 0}")

Result shape (10, 1)
[[ -2.6488411]
 [ -2.6488411]
 [ -2.6488411]
 [-10.00858  ]
 [ -8.250362 ]
 [ -3.3406227]
 [ -2.6488411]
 [ -2.6488411]
 [ -2.6488411]
 [ -2.6488411]]
Cat probability: -4.014145851135254
Is cat? False
