In [1]:
import os

from IPython import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub

yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_model_handle)

2023-02-21 13:47:30.121318: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-21 13:47:34.788172: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# scan data directories
import glob

nature_file_list = []
nature_file_list += glob.glob('../data/interim/AmbisonicSoundLibrary/nature/*')
nature_file_list += glob.glob('../data/interim/GoogleAudioSet/Outside, rural or natural/*')
nature_file_list += glob.glob('../data/interim/youtube/NatureSoundscapes/*')
nature_file_list += glob.glob('../data/interim/youtube/NomadicAmbience_nature/*')
nature_file_list += glob.glob('../data/interim/S2L_LULC/non_urban/*')
nature_file_list += glob.glob('../data/interim/S2L_LULC/urban_0_25/*')

city_file_list = []
city_file_list += glob.glob('../data/interim/GoogleAudioSet/Outside, urban or manmade/*')
city_file_list += glob.glob('../data/interim/youtube/NomadicAmbience_city/*')
city_file_list += glob.glob('../data/interim/SONYC/**/*.pkl')
city_file_list += glob.glob('../data/interim/S2L_LULC/urban_26_100/*')

nature_source_list = ['nature_'+i.rsplit('/', 3)[1]+'/'+i.rsplit('/', 3)[2] for i in nature_file_list]
city_source_list = ['city_'+i.rsplit('/', -1)[3] for i in city_file_list]

In [3]:
nature_df = pd.DataFrame({'file': nature_file_list, 'source': nature_source_list, 'category': 0})
city_df = pd.DataFrame({'file': city_file_list, 'source': city_source_list, 'category': 1})
df_all = pd.concat([nature_df, city_df], ignore_index=True)
df_all

Unnamed: 0,file,source,category
0,../data/interim/AmbisonicSoundLibrary/nature/W...,nature_AmbisonicSoundLibrary/nature,0
1,../data/interim/AmbisonicSoundLibrary/nature/R...,nature_AmbisonicSoundLibrary/nature,0
2,../data/interim/AmbisonicSoundLibrary/nature/A...,nature_AmbisonicSoundLibrary/nature,0
3,../data/interim/AmbisonicSoundLibrary/nature/W...,nature_AmbisonicSoundLibrary/nature,0
4,../data/interim/AmbisonicSoundLibrary/nature/L...,nature_AmbisonicSoundLibrary/nature,0
...,...,...,...
1919,../data/interim/S2L_LULC/urban_26_100/s2lam111...,city_S2L_LULC,1
1920,../data/interim/S2L_LULC/urban_26_100/s2lam006...,city_S2L_LULC,1
1921,../data/interim/S2L_LULC/urban_26_100/s2lam083...,city_S2L_LULC,1
1922,../data/interim/S2L_LULC/urban_26_100/s2lam052...,city_S2L_LULC,1


In [4]:
from sklearn.model_selection import StratifiedKFold

# Split the data into folds using StratifiedKFold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=23)
for fold, (train_idx, val_idx) in enumerate(skf.split(df_all, df_all['source'])):
    # Assign the fold number to each row in the DataFrame
    df_all.loc[val_idx, 'fold'] = fold
    
df_all['fold'] = df_all['fold'].astype('int')
df_all = df_all.sample(frac=1, random_state=23).reset_index(drop=True) # need to shuffle the rows before deep learning
df_all

In [5]:
df_all.to_csv('')

Unnamed: 0,file,source,category,fold
0,../data/interim/S2L_LULC/urban_0_25/s2llg003_1...,nature_S2L_LULC/urban_0_25,0,9
1,"../data/interim/GoogleAudioSet/Outside, urban ...",city_GoogleAudioSet,1,8
2,../data/interim/SONYC/audio-12/34_018803.pkl,city_SONYC,1,7
3,"../data/interim/GoogleAudioSet/Outside, urban ...",city_GoogleAudioSet,1,4
4,"../data/interim/GoogleAudioSet/Outside, urban ...",city_GoogleAudioSet,1,0
...,...,...,...,...
1919,../data/interim/SONYC/audio-4/22_004702.pkl,city_SONYC,1,1
1920,../data/interim/S2L_LULC/urban_0_25/s2lam042_1...,nature_S2L_LULC/urban_0_25,0,2
1921,"../data/interim/GoogleAudioSet/Outside, urban ...",city_GoogleAudioSet,1,1
1922,../data/interim/S2L_LULC/urban_0_25/s2llg001_1...,nature_S2L_LULC/urban_0_25,0,0


# Convert data into TF format

In [6]:
filenames = df_all['file']
targets = df_all['category']
folds = df_all['fold']

main_ds = tf.data.Dataset.from_tensor_slices((filenames, targets, folds))
main_ds.element_spec

(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [7]:
def load_wav_pkl(filename, wav_label='y'):
    import pickle
    # open a file, where you stored the pickled data
    file = open(filename, 'rb')

    # dump information to that file
    output = pickle.load(file)
    wav = output[wav_label]

    # close the file
    file.close()
    return wav

wav_list = []
for index, row in df_all.iterrows():
    if len(load_wav_pkl(row['file'])) == 160000:
        wav_list.append(load_wav_pkl(row['file']))
    else: # if the waveform is shorter (for unknown reason)
        temp_wav = load_wav_pkl(row['file'])
        print('short length: '+str(len(temp_wav)))
        wav_list.append(np.pad(temp_wav, (0,160000-len(temp_wav)),'mean')) # zero-padding at the end to 160000

wav_bg_list = []
for index, row in df_all.iterrows():
    if len(load_wav_pkl(row['file'], 'bg_y')) == 160000:
        wav_bg_list.append(load_wav_pkl(row['file'], 'bg_y'))
    else: # if the waveform is shorter (for unknown reason)
        temp_wav = load_wav_pkl(row['file'], 'bg_y')
#         print('short length: '+str(len(temp_wav)))
        wav_bg_list.append(np.pad(temp_wav, (0,160000-len(temp_wav)),'mean')) # zero-padding at the end to 160000

wav_fg_list = []
for index, row in df_all.iterrows():
    if len(load_wav_pkl(row['file'], 'fg_y')) == 160000:
        wav_fg_list.append(load_wav_pkl(row['file'], 'fg_y'))
    else: # if the waveform is shorter (for unknown reason)
        temp_wav = load_wav_pkl(row['file'], 'fg_y')
#         print('short length: '+str(len(temp_wav)))
        wav_fg_list.append(np.pad(temp_wav, (0,160000-len(temp_wav)),'mean')) # zero-padding at the end to 160000

short length: 159999
short length: 159880
short length: 146099
short length: 159880
short length: 159880
short length: 151683
short length: 159880
short length: 153357
short length: 153242
short length: 157848
short length: 159992
short length: 151461
short length: 148006
short length: 159997
short length: 156480
short length: 159993


In [8]:
main_ds = tf.data.Dataset.from_tensor_slices((np.stack(wav_list, axis = 0), df_all['category'], df_all['fold']))
main_ds_fg = tf.data.Dataset.from_tensor_slices((np.stack(wav_fg_list, axis = 0), df_all['category'], df_all['fold']))
main_ds_bg = tf.data.Dataset.from_tensor_slices((np.stack(wav_bg_list, axis = 0), df_all['category'], df_all['fold']))

main_ds.element_spec

(TensorSpec(shape=(160000,), dtype=tf.float32, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [9]:
# applies the embedding extraction model to a wav data
def extract_embedding(wav_data, label, fold):
    # run YAMNet to extract embedding from the wav data
    scores, embeddings, spectrogram = yamnet_model(wav_data)
    num_embeddings = tf.shape(embeddings)[0]
    return (embeddings,
            tf.repeat(label, num_embeddings),
            tf.repeat(fold, num_embeddings))

# extract embedding
main_ds = main_ds.map(extract_embedding).unbatch()
main_ds_fg = main_ds_fg.map(extract_embedding).unbatch()
main_ds_bg = main_ds_bg.map(extract_embedding).unbatch()

main_ds.element_spec

(TensorSpec(shape=(1024,), dtype=tf.float32, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [10]:
main_ds_all3 = tf.data.Dataset.from_tensor_slices((np.stack(wav_list, axis = 0), np.stack(wav_bg_list, axis = 0), np.stack(wav_fg_list, axis = 0), df_all['category'], df_all['fold']))
main_ds_all3.element_spec

(TensorSpec(shape=(160000,), dtype=tf.float32, name=None),
 TensorSpec(shape=(160000,), dtype=tf.float32, name=None),
 TensorSpec(shape=(160000,), dtype=tf.float32, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [11]:
# applies the embedding extraction model to a wav data
def extract_embedding_3(wav_data_raw, wav_data_bg, wav_data_fg, label, fold):
    # run YAMNet to extract embedding from the wav data
    scores, embeddings_raw, spectrogram = yamnet_model(wav_data_raw)
    scores, embeddings_bg, spectrogram = yamnet_model(wav_data_bg)
    scores, embeddings_fg, spectrogram = yamnet_model(wav_data_fg)
    num_embeddings_raw = tf.shape(embeddings_raw)[0]
    return (tf.concat([embeddings_raw, embeddings_bg, embeddings_fg],1),
            tf.repeat(label, num_embeddings_raw),
            tf.repeat(fold, num_embeddings_raw))

# extract embedding
main_ds_3 = main_ds_all3.map(extract_embedding_3).unbatch()

main_ds_3.element_spec

(TensorSpec(shape=(3072,), dtype=tf.float32, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [12]:
## raw signal
cached_ds = main_ds.cache()
train_ds = cached_ds.filter(lambda embedding, label, fold: fold < 8)
val_ds = cached_ds.filter(lambda embedding, label, fold: fold == 8)
test_ds = cached_ds.filter(lambda embedding, label, fold: fold == 9)

# remove the folds column now that it's not needed anymore
remove_fold_column = lambda embedding, label, fold: (embedding, label)

train_ds = train_ds.map(remove_fold_column)
val_ds = val_ds.map(remove_fold_column)
test_ds = test_ds.map(remove_fold_column)

train_ds = train_ds.cache().shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [13]:
## background sound
cached_ds_bg = main_ds_bg.cache()
train_ds_bg = cached_ds_bg.filter(lambda embedding, label, fold: fold < 8)
val_ds_bg = cached_ds_bg.filter(lambda embedding, label, fold: fold == 8)
test_ds_bg = cached_ds_bg.filter(lambda embedding, label, fold: fold == 9)

# remove the folds column now that it's not needed anymore
remove_fold_column = lambda embedding, label, fold: (embedding, label)

train_ds_bg = train_ds_bg.map(remove_fold_column)
val_ds_bg = val_ds_bg.map(remove_fold_column)
test_ds_bg = test_ds_bg.map(remove_fold_column)

train_ds_bg = train_ds_bg.cache().shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)
val_ds_bg = val_ds_bg.cache().batch(32).prefetch(tf.data.AUTOTUNE)
test_ds_bg = test_ds_bg.cache().batch(32).prefetch(tf.data.AUTOTUNE)

In [14]:
## foreground sound
cached_ds_fg = main_ds_fg.cache()
train_ds_fg = cached_ds_fg.filter(lambda embedding, label, fold: fold < 8)
val_ds_fg = cached_ds_fg.filter(lambda embedding, label, fold: fold == 8)
test_ds_fg = cached_ds_fg.filter(lambda embedding, label, fold: fold == 9)

# remove the folds column now that it's not needed anymore
remove_fold_column = lambda embedding, label, fold: (embedding, label)

train_ds_fg = train_ds_fg.map(remove_fold_column)
val_ds_fg = val_ds_fg.map(remove_fold_column)
test_ds_fg = test_ds_fg.map(remove_fold_column)

train_ds_fg = train_ds_fg.cache().shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)
val_ds_fg = val_ds_fg.cache().batch(32).prefetch(tf.data.AUTOTUNE)
test_ds_fg = test_ds_fg.cache().batch(32).prefetch(tf.data.AUTOTUNE)

In [15]:
## all 3 signals
cached_ds_3 = main_ds_3.cache()
train_ds_3 = cached_ds_3.filter(lambda embedding, label, fold: fold < 8)
val_ds_3 = cached_ds_3.filter(lambda embedding, label, fold: fold == 8)
test_ds_3 = cached_ds_3.filter(lambda embedding, label, fold: fold == 9)

# remove the folds column now that it's not needed anymore
remove_fold_column = lambda embedding, label, fold: (embedding, label)

train_ds_3 = train_ds_3.map(remove_fold_column)
val_ds_3 = val_ds_3.map(remove_fold_column)
test_ds_3 = test_ds_3.map(remove_fold_column)

train_ds_3 = train_ds_3.cache().shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)
val_ds_3 = val_ds_3.cache().batch(32).prefetch(tf.data.AUTOTUNE)
test_ds_3 = test_ds_3.cache().batch(32).prefetch(tf.data.AUTOTUNE)

# Model of raw signal

In [13]:
my_classes = ['city', 'nature']
raw_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(1024), dtype=tf.float32,
                          name='input_embedding'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(len(my_classes))
], name='raw_model')

raw_model.summary()

Model: "raw_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               524800    
                                                                 
 dense_1 (Dense)             (None, 2)                 1026      
                                                                 
Total params: 525,826
Trainable params: 525,826
Non-trainable params: 0
_________________________________________________________________


In [14]:
raw_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                    optimizer="adam",
                    metrics=['accuracy'])

# callback will be used in the other models below too
callback = tf.keras.callbacks.EarlyStopping(monitor='loss',
                                            patience=3,
                                            restore_best_weights=True)

In [15]:
history = raw_model.fit(train_ds,
                       epochs=20,
                       validation_data=val_ds,
                       callbacks=callback)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [16]:
loss, accuracy = raw_model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  1.2211443185806274
Accuracy:  0.76953125


# Model of background signal

In [22]:
my_classes = ['city', 'nature']
bg_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(1024), dtype=tf.float32,
                          name='input_embedding'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(len(my_classes))
], name='bg_model')

bg_model.summary()

Model: "bg_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 512)               524800    
                                                                 
 dense_7 (Dense)             (None, 2)                 1026      
                                                                 
Total params: 525,826
Trainable params: 525,826
Non-trainable params: 0
_________________________________________________________________


In [23]:
bg_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                    optimizer="adam",
                    metrics=['accuracy'])

history = bg_model.fit(train_ds_bg,
                       epochs=20,
                       validation_data=val_ds_bg,
                       callbacks=callback)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [24]:
loss, accuracy = bg_model.evaluate(test_ds_bg)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  1.1917799711227417
Accuracy:  0.7671874761581421


# Model of foreground signal

In [25]:
my_classes = ['city', 'nature']
fg_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(1024), dtype=tf.float32,
                          name='input_embedding'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(len(my_classes))
], name='fg_model')

fg_model.summary()

Model: "fg_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 512)               524800    
                                                                 
 dense_9 (Dense)             (None, 2)                 1026      
                                                                 
Total params: 525,826
Trainable params: 525,826
Non-trainable params: 0
_________________________________________________________________


In [26]:
fg_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                    optimizer="adam",
                    metrics=['accuracy'])

history = fg_model.fit(train_ds_fg,
                       epochs=20,
                       validation_data=val_ds_fg,
                       callbacks=callback)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [27]:
loss, accuracy = fg_model.evaluate(test_ds_fg)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.9817324876785278
Accuracy:  0.6583333611488342


# Model of 3 signals

In [27]:
my_classes = ['city', 'nature']
tf.keras.backend.clear_session()
all3_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(3072), dtype=tf.float32,
                          name='input_embedding'),
    tf.keras.layers.Dense(1024, activation='relu'),
#     tf.keras.layers.Dense(1024, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(len(my_classes))
], name='all3_model')

all3_model.summary()

Model: "all3_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1024)              3146752   
                                                                 
 dense_1 (Dense)             (None, 512)               524800    
                                                                 
 dense_2 (Dense)             (None, 2)                 1026      
                                                                 
Total params: 3,672,578
Trainable params: 3,672,578
Non-trainable params: 0
_________________________________________________________________


In [28]:
all3_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                    optimizer="adam",
                    metrics=['accuracy'])

# callback will be used in the other models below too
callback = tf.keras.callbacks.EarlyStopping(monitor='loss',
                                            patience=3,
                                            restore_best_weights=True)


history = all3_model.fit(train_ds_3,
                       epochs=20,
                       validation_data=val_ds_3,
                       callbacks=callback)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [29]:
loss, accuracy = all3_model.evaluate(test_ds_3)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  1.827581524848938
Accuracy:  0.7533854246139526
