In [1]:
import os

from IPython import display
# import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random as python_random

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import regularizers

from lazypredict.Supervised import LazyClassifier

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


# TF Hub handle of the YAMNet model; the loaded model is called by extract_yamnet_embedding.
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
# Troubleshooting: if loading fails with "SavedModel file does not exist at: <folder>",
# delete that folder and rerun this cell.
yamnet_model = hub.load(yamnet_model_handle)


2023-03-03 00:25:22.125133: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-03 00:25:27.350246: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the train/val/test file list; the first CSV column is used as the row index.
# Later cells read the 'file', 'category' and 'fold' columns from this frame.
df_all = pd.read_csv('../train_val_test_split/train_val_test_GoogleAudioSet.csv', index_col=0)
df_all

Unnamed: 0,file,source,category,weight,fold
0,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,0
1,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,8
2,../data/interim/GoogleAudioSet_unbalanced_list...,Google_nature,0,1,5
3,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,1
4,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,1
...,...,...,...,...,...
13662,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,5
13663,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,3
13664,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,8
13665,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,8


In [3]:
# Sanity check: total of the 'weight' column for each 'source' group.
df_all.groupby(['source'])['weight'].sum()

source
Google_city      6687
Google_nature    6980
Name: weight, dtype: int64

In [4]:
def _mean_yamnet_embedding(waveform):
    """Return the frame-averaged YAMNet embedding of one waveform as a NumPy array."""
    # yamnet_model returns (scores, embeddings, spectrogram); only the
    # per-frame embeddings are needed here.
    _, frame_embeddings, _ = yamnet_model(waveform)
    # Averaging over axis 0 collapses the frame dimension, so every clip
    # yields a single vector regardless of its length.
    return tf.reduce_mean(frame_embeddings, axis=0).numpy()


def extract_yamnet_embedding(filename):
    """Compute mean YAMNet embeddings for the three waveforms stored in a pickle.

    The pickle at ``filename`` must contain a mapping with the keys ``'y'``
    (raw signal), ``'bg_y'`` (background signal) and ``'fg_y'`` (foreground
    signal).

    Parameters
    ----------
    filename : str or path-like
        Path of the pickle file to read.

    Returns
    -------
    tuple
        ``(embedding_raw, embedding_bg, embedding_fg)``: one frame-averaged
        embedding (NumPy array) per waveform, in that order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    KeyError
        If one of the expected keys is missing from the pickled mapping.
    """
    import pickle

    # NOTE(security): pickle.load can execute arbitrary code; only use this on
    # files produced by a trusted preprocessing step.
    # The context manager guarantees the handle is closed even if loading fails.
    with open(filename, 'rb') as file:
        output = pickle.load(file)

    wav_raw = output['y']
    wav_bg = output['bg_y']
    wav_fg = output['fg_y']

    # The waveforms are deliberately not padded to a common length: the time
    # dimension is collapsed by the mean over frames anyway.
    embedding_tensor_raw = _mean_yamnet_embedding(wav_raw)
    embedding_tensor_bg = _mean_yamnet_embedding(wav_bg)
    embedding_tensor_fg = _mean_yamnet_embedding(wav_fg)

    return embedding_tensor_raw, embedding_tensor_bg, embedding_tensor_fg

In [6]:
from joblib import Parallel, delayed
import time

# Per-file embeddings are collected in three parallel lists (raw / background /
# foreground), in the same order as the rows of df_all.
embeddings_raw_list = []
embeddings_bg_list = []
embeddings_fg_list = []

start_time = time.time()

# Only the 'file' column is needed, so iterate over it directly.
for file_path in df_all['file']:
    raw_vec, bg_vec, fg_vec = extract_yamnet_embedding(file_path)
    embeddings_raw_list.append(raw_vec)
    embeddings_bg_list.append(bg_vec)
    embeddings_fg_list.append(fg_vec)

# Stack the per-file vectors into one matrix per signal type (one row per file).
embeddings_raw_matrix = np.stack(embeddings_raw_list, axis=0)
embeddings_bg_matrix = np.stack(embeddings_bg_list, axis=0)
embeddings_fg_matrix = np.stack(embeddings_fg_list, axis=0)

# Report the wall-clock time of the whole extraction.
print('seconds: '+str(time.time()-start_time))


seconds: 2662.80260515213


In [12]:
# Join the raw, background and foreground embedding matrices column-wise (axis=1),
# giving one combined feature row per file.
embeddings_matrix = np.concatenate((embeddings_raw_matrix, embeddings_bg_matrix, embeddings_fg_matrix), axis=1)

In [14]:
# Boolean row masks derived from the 'fold' column:
# folds below 8 are used for training, fold 8 for validation, fold 9 for testing.
fold = df_all['fold']
train_index = fold.lt(8)
valid_index = fold.eq(8)
test_index = fold.eq(9)

# Raw signal

In [None]:
# Features: raw-signal embeddings only; labels: the 'category' column.
X_train = embeddings_raw_matrix[train_index, :]
X_valid = embeddings_raw_matrix[valid_index, :]
y_train = df_all.loc[train_index, 'category']
y_valid = df_all.loc[valid_index, 'category']

# Mean-impute NaNs, then standardise. The pipeline is fitted on the training
# split only and then applied unchanged to the validation split.
transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', StandardScaler()),
])
X_train = transformer.fit_transform(X_train)
X_valid = transformer.transform(X_valid)

# Train on the training split and evaluate on the validation split;
# `models` is displayed as the cell output.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_valid, y_train, y_valid)
models

# Background signal

In [None]:
# Features: background-signal embeddings only; labels: the 'category' column.
X_train = embeddings_bg_matrix[train_index, :]
X_valid = embeddings_bg_matrix[valid_index, :]
y_train = df_all.loc[train_index, 'category']
y_valid = df_all.loc[valid_index, 'category']

# Mean-impute NaNs, then standardise. The pipeline is fitted on the training
# split only and then applied unchanged to the validation split.
transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', StandardScaler()),
])
X_train = transformer.fit_transform(X_train)
X_valid = transformer.transform(X_valid)

# Train on the training split and evaluate on the validation split;
# `models` is displayed as the cell output.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_valid, y_train, y_valid)
models

# Foreground signal

In [None]:
# Features: foreground-signal embeddings only; labels: the 'category' column.
X_train = embeddings_fg_matrix[train_index, :]
X_valid = embeddings_fg_matrix[valid_index, :]
y_train = df_all.loc[train_index, 'category']
y_valid = df_all.loc[valid_index, 'category']

# Mean-impute NaNs, then standardise. The pipeline is fitted on the training
# split only and then applied unchanged to the validation split.
transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', StandardScaler()),
])
X_train = transformer.fit_transform(X_train)
X_valid = transformer.transform(X_valid)

# Train on the training split and evaluate on the validation split;
# `models` is displayed as the cell output.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_valid, y_train, y_valid)
models

# All signals (raw + background + foreground embeddings concatenated)

In [15]:
# Features: the combined embeddings_matrix; labels: the 'category' column.
X_train = embeddings_matrix[train_index, :]
X_valid = embeddings_matrix[valid_index, :]
y_train = df_all.loc[train_index, 'category']
y_valid = df_all.loc[valid_index, 'category']

# Mean-impute NaNs, then standardise. The pipeline is fitted on the training
# split only and then applied unchanged to the validation split.
transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', StandardScaler()),
])
X_train = transformer.fit_transform(X_train)
X_valid = transformer.transform(X_valid)

# Train on the training split and evaluate on the validation split;
# `models` is displayed as the cell output.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_valid, y_train, y_valid)
models


100%|███████████████████████████████████████████| 29/29 [28:16<00:00, 58.49s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SVC,0.7,0.7,0.7,0.7,209.6
NuSVC,0.7,0.7,0.7,0.7,350.83
ExtraTreesClassifier,0.7,0.7,0.7,0.7,11.98
LGBMClassifier,0.69,0.69,0.69,0.69,14.68
RandomForestClassifier,0.69,0.69,0.69,0.69,34.83
RidgeClassifierCV,0.69,0.69,0.69,0.69,22.95
LogisticRegression,0.68,0.68,0.68,0.68,2.72
XGBClassifier,0.68,0.68,0.68,0.68,91.35
AdaBoostClassifier,0.68,0.68,0.68,0.68,127.93
RidgeClassifier,0.67,0.67,0.67,0.67,12.61


In [None]:
  
    
    
    

# # Define a function to extract YAMNet embeddings for a single audio file
# def extract_yamnet_embedding(audio_path):
#     # Load the audio file
#     audio, sample_rate = tf.audio.decode_wav(tf.io.read_file(audio_path))
    
#     # Extract YAMNet embeddings for each frame
#     embedding_tensor = yamnet_model(audio)
#     embeddings = tf.reduce_mean(embedding_tensor, axis=0)
    
#     return embeddings

# # Define a function to parse a single metadata file
# def parse_metadata(metadata_path):
#     # Implement your metadata parsing code here
#     metadata = ...
    
#     return metadata

# # Define a function to combine YAMNet embeddings and metadata for a single audio file
# def combine_features(audio_path, metadata_path):
#     # Extract YAMNet embeddings
#     yamnet_embedding = extract_yamnet_embedding(audio_path)
    
#     # Parse metadata
#     metadata = parse_metadata(metadata_path)
    
#     # Concatenate the YAMNet embedding and metadata into a single feature vector
#     feature_vector = tf.concat([yamnet_embedding, metadata], axis=-1)
    
#     return feature_vector

# # Define a function to load and preprocess a single data sample
# def load_and_preprocess_data(audio_path, metadata_path):
#     # Combine YAMNet embeddings and metadata
#     feature_vector = combine_features(audio_path, metadata_path)
    
#     # Implement your data preprocessing code here
#     preprocessed_data = ...
    
#     return preprocessed_data

# # Load a list of audio and metadata file paths
# audio_paths = ["path/to/audio1.wav", "path/to/audio2.wav", ...]
# metadata_paths = ["path/to/metadata1.csv", "path/to/metadata2.csv", ...]

# # Create a TensorFlow dataset using the audio and metadata file paths
# dataset = tf.data.Dataset.from_tensor_slices((audio_paths, metadata_paths))

# # Use the map function to apply the load_and_preprocess_data function to each sample in the dataset
# dataset = dataset.map(load_and_preprocess_data)

# # Implement your training code here using the preprocessed dataset
# ...