# Convert audio signals to pre-trained CNN embeddings

This notebook converts audio signals into embeddings produced by pre-trained CNN models; the embeddings are saved for use as input features in later classification tasks. The pre-trained convolutional neural networks act as feature extractors that derive high-level features from the audio data.

Three popular pre-trained CNN models for classifying audio events have been selected:
- VGGish: https://github.com/tensorflow/models/tree/master/research/audioset/vggish
- YAMNet: https://github.com/tensorflow/models/tree/master/research/audioset/yamnet
- PANNs: https://github.com/qiuqiangkong/audioset_tagging_cnn

In [2]:
import os

from IPython import display
# import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random as python_random

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import regularizers

from lazypredict.Supervised import LazyClassifier

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


# TF-Hub handles of the two pre-trained audio models used in this notebook.
YAMNET_HANDLE = 'https://tfhub.dev/google/yamnet/1'
VGGISH_HANDLE = 'https://tfhub.dev/google/vggish/1'

# Troubleshooting: if loading fails with "SavedModel file does not exist at: <folder>",
# delete the folder named in the error message and rerun this cell.
yamnet_model = hub.load(YAMNET_HANDLE)
vggish_model = hub.load(VGGISH_HANDLE)

2023-03-10 14:36:19.071527: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Clip table: each row's 'file' column is the path of one pickled clip that the
# feature-extraction cells open. The first CSV column is used as the index.
split_csv_path = '../train_val_test_split/train_val_test_GoogleAudioSet.csv'
df_all = pd.read_csv(split_csv_path, index_col=0)
df_all

Unnamed: 0,file,source,category,weight,fold
0,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,0
1,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,8
2,../data/interim/GoogleAudioSet_unbalanced_list...,Google_nature,0,1,5
3,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,1
4,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,1
...,...,...,...,...,...
13662,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,5
13663,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,3
13664,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,8
13665,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,8


In [4]:
def _mean_embedding(frame_embeddings):
    """Average embeddings over axis 0 and return the result as a NumPy array."""
    return tf.reduce_mean(frame_embeddings, axis=0).numpy()


def data_preprocessing(filename):
    """Load one pickled clip and compute its YAMNet, VGGish and MPS features.

    The pickle must contain a dict with the keys read below: 'y', 'bg_y',
    'fg_y' (waveforms; bg/fg presumably mean background/foreground - confirm
    against the code that writes these files), 'df_indices', 'df_indices_fg',
    'df_indices_bg', 'wt', 'mps', 'mps_fg' and 'mps_bg'.

    Relies on the module-level `yamnet_model`, `vggish_model` and `tf`.

    Parameters
    ----------
    filename : str
        Path of the pickle file to read.

    Returns
    -------
    tuple
        Twelve items, in this order:
        YAMNet mean embedding (raw, bg, fg),
        flattened MPS restricted to wt <= 100 (raw, bg, fg),
        df_indices objects (raw, bg, fg),
        VGGish mean embedding (raw, bg, fg).
    """
    import pickle

    # NOTE(security): pickle.load can execute arbitrary code stored in the
    # file, so only call this on trusted, locally generated files.
    # The context manager closes the file even if unpickling fails.
    with open(filename, 'rb') as file:
        output = pickle.load(file)

    wav_raw = output['y']
    wav_bg = output['bg_y']
    wav_fg = output['fg_y']

    df_indices_raw = output['df_indices']
    df_indices_fg = output['df_indices_fg']
    df_indices_bg = output['df_indices_bg']

    # Exclude temporal modulation frequencies > 100 Hz, then flatten each MPS.
    keep_columns = output['wt'] <= 100
    mps_raw = output['mps'][:, keep_columns].reshape(-1)
    mps_fg = output['mps_fg'][:, keep_columns].reshape(-1)
    mps_bg = output['mps_bg'][:, keep_columns].reshape(-1)

    # YAMNet returns (scores, embeddings, spectrogram); only the per-frame
    # embeddings are used, averaged into one vector per signal.
    _, yamnet_frames_raw, _ = yamnet_model(wav_raw)
    _, yamnet_frames_bg, _ = yamnet_model(wav_bg)
    _, yamnet_frames_fg, _ = yamnet_model(wav_fg)
    embedding_tensor_raw = _mean_embedding(yamnet_frames_raw)
    embedding_tensor_bg = _mean_embedding(yamnet_frames_bg)
    embedding_tensor_fg = _mean_embedding(yamnet_frames_fg)

    # VGGish output is averaged over axis 0 in the same way.
    vggish_embedding_raw = _mean_embedding(vggish_model(wav_raw))
    vggish_embedding_bg = _mean_embedding(vggish_model(wav_bg))
    vggish_embedding_fg = _mean_embedding(vggish_model(wav_fg))

    return (
        embedding_tensor_raw, embedding_tensor_bg, embedding_tensor_fg,
        mps_raw, mps_bg, mps_fg,
        df_indices_raw, df_indices_bg, df_indices_fg,
        vggish_embedding_raw, vggish_embedding_bg, vggish_embedding_fg,
    )

In [None]:
import librosa
import panns_inference
from panns_inference import AudioTagging, SoundEventDetection, labels
import pickle
from librosa import resample
import time

start_time = time.time()

# Every clip is forced to this many samples so that all clips can be stacked
# into one 2-D array: 160000 samples = 10 s at the 16 kHz source rate passed
# to `resample` below.
CLIP_SAMPLES = 160000


def _fix_length(wav, n_samples=CLIP_SAMPLES):
    """Return `wav` with exactly `n_samples` samples.

    Shorter signals are right-padded with their mean value (np.pad mode
    'mean'). Longer signals are truncated first, because np.pad raises
    ValueError when given a negative pad width.
    """
    wav = wav[:n_samples]
    return np.pad(wav, (0, n_samples - len(wav)), 'mean')


# Load the three waveforms of every clip listed in df_all.
wav_raw_list = []
wav_bg_list = []
wav_fg_list = []
for index, row in df_all.iterrows():
    # NOTE(security): pickle.load can execute arbitrary code; trusted files only.
    # The context manager closes the file even if unpickling fails.
    with open(row['file'], 'rb') as file:
        output = pickle.load(file)
    wav_raw_list.append(_fix_length(output['y']))
    wav_bg_list.append(_fix_length(output['bg_y']))
    wav_fg_list.append(_fix_length(output['fg_y']))

# Upsample 16 kHz -> 32 kHz along the time axis; each list is released as soon
# as its stacked, resampled array exists to limit peak memory.
wav_raw = resample(np.stack(wav_raw_list), orig_sr=16000, target_sr=32000, axis=1)
del wav_raw_list
wav_bg = resample(np.stack(wav_bg_list), orig_sr=16000, target_sr=32000, axis=1)
del wav_bg_list
wav_fg = resample(np.stack(wav_fg_list), orig_sr=16000, target_sr=32000, axis=1)
del wav_fg_list


n_file = 0
clipwise_output_raw_list = []
clipwise_output_bg_list = []
clipwise_output_fg_list = []
embedding_raw_list = []
embedding_bg_list = []
embedding_fg_list = []
file_step = 100  # number of clips per inference call

at = AudioTagging(checkpoint_path=None, device='cuda')
while n_file < len(df_all):
    print(str(n_file))  # progress: index of the first clip in this batch
    # Slicing clamps at the end of the array, so the last batch may be shorter.
    batch = slice(n_file, n_file + file_step)
    (clipwise_output_raw, embedding_raw) = at.inference(wav_raw[batch])
    (clipwise_output_bg, embedding_bg) = at.inference(wav_bg[batch])
    (clipwise_output_fg, embedding_fg) = at.inference(wav_fg[batch])
    n_file += file_step

    clipwise_output_raw_list.append(clipwise_output_raw)
    clipwise_output_bg_list.append(clipwise_output_bg)
    clipwise_output_fg_list.append(clipwise_output_fg)
    embedding_raw_list.append(embedding_raw)
    embedding_bg_list.append(embedding_bg)
    embedding_fg_list.append(embedding_fg)


# Join the per-batch outputs along the clip axis, dropping each list once merged.
panns_clip_raw = np.concatenate(clipwise_output_raw_list, axis=0)
del clipwise_output_raw_list

panns_clip_bg = np.concatenate(clipwise_output_bg_list, axis=0)
del clipwise_output_bg_list

panns_clip_fg = np.concatenate(clipwise_output_fg_list, axis=0)
del clipwise_output_fg_list

panns_embedding_raw = np.concatenate(embedding_raw_list, axis=0)
del embedding_raw_list

panns_embedding_bg = np.concatenate(embedding_bg_list, axis=0)
del embedding_bg_list

panns_embedding_fg = np.concatenate(embedding_fg_list, axis=0)
del embedding_fg_list


# Elapsed wall-clock time of this cell, in seconds.
print(time.time()-start_time)

In [6]:
# Persist the PANNs clip-wise outputs and embeddings, one .npy file per array.
panns_outputs = [
    ('../data/processed/20230304/panns_clip_raw.npy', panns_clip_raw),
    ('../data/processed/20230304/panns_clip_bg.npy', panns_clip_bg),
    ('../data/processed/20230304/panns_clip_fg.npy', panns_clip_fg),
    ('../data/processed/20230304/panns_embedding_raw.npy', panns_embedding_raw),
    ('../data/processed/20230304/panns_embedding_bg.npy', panns_embedding_bg),
    ('../data/processed/20230304/panns_embedding_fg.npy', panns_embedding_fg),
]
for npy_path, panns_array in panns_outputs:
    np.save(npy_path, panns_array)

In [4]:
import time

# One accumulator per value returned by data_preprocessing.
embedding_raw_list, embedding_bg_list, embedding_fg_list = [], [], []
mps_raw_list, mps_bg_list, mps_fg_list = [], [], []
indices_raw_list, indices_bg_list, indices_fg_list = [], [], []
vgg_raw_list, vgg_bg_list, vgg_fg_list = [], [], []

# Ordered exactly like the tuple returned by data_preprocessing, so that each
# returned value is appended to its matching list.
feature_collectors = (
    embedding_raw_list, embedding_bg_list, embedding_fg_list,
    mps_raw_list, mps_bg_list, mps_fg_list,
    indices_raw_list, indices_bg_list, indices_fg_list,
    vgg_raw_list, vgg_bg_list, vgg_fg_list,
)

start_time = time.time()

for index, row in df_all.iterrows():
    clip_features = data_preprocessing(row['file'])
    for collector, feature in zip(feature_collectors, clip_features):
        collector.append(feature)

# Report the elapsed wall-clock time of the extraction loop.
elapsed_seconds = time.time() - start_time
print('seconds: ' + str(elapsed_seconds))


seconds: 6113.771539211273


In [5]:
# Stack the per-clip YAMNet mean embeddings into one array per signal type,
# with clips along axis 0.
embedding_raw_matrix, embedding_bg_matrix, embedding_fg_matrix = (
    np.stack(per_clip_embeddings, axis=0)
    for per_clip_embeddings in (embedding_raw_list, embedding_bg_list, embedding_fg_list)
)

In [6]:
# Concatenate the per-clip index objects into one table per signal type;
# ignore_index=True replaces the per-clip indices with a fresh 0..n-1 index.
df_indices_raw, df_indices_bg, df_indices_fg = (
    pd.concat(per_clip_indices, ignore_index=True)
    for per_clip_indices in (indices_raw_list, indices_bg_list, indices_fg_list)
)

In [7]:
# Stack the flattened per-clip MPS vectors into one array per signal type
# (np.stack joins along a new first axis by default).
mps_raw_matrix, mps_bg_matrix, mps_fg_matrix = (
    np.stack(per_clip_mps)
    for per_clip_mps in (mps_raw_list, mps_bg_list, mps_fg_list)
)

In [8]:
# Stack the per-clip VGGish mean embeddings into one array per signal type,
# with clips along axis 0.
vgg_raw_matrix, vgg_bg_matrix, vgg_fg_matrix = (
    np.stack(per_clip_vgg, axis=0)
    for per_clip_vgg in (vgg_raw_list, vgg_bg_list, vgg_fg_list)
)

In [9]:
# Persist the YAMNet, VGGish and MPS feature arrays as .npy files.
feature_arrays = [
    ('../data/processed/20230304/embedding_raw_matrix.npy', embedding_raw_matrix),
    ('../data/processed/20230304/embedding_bg_matrix.npy', embedding_bg_matrix),
    ('../data/processed/20230304/embedding_fg_matrix.npy', embedding_fg_matrix),
    ('../data/processed/20230304/vgg_raw_matrix.npy', vgg_raw_matrix),
    ('../data/processed/20230304/vgg_bg_matrix.npy', vgg_bg_matrix),
    ('../data/processed/20230304/vgg_fg_matrix.npy', vgg_fg_matrix),
    ('../data/processed/20230304/mps_raw_matrix.npy', mps_raw_matrix),
    ('../data/processed/20230304/mps_bg_matrix.npy', mps_bg_matrix),
    ('../data/processed/20230304/mps_fg_matrix.npy', mps_fg_matrix),
]
for npy_path, feature_array in feature_arrays:
    np.save(npy_path, feature_array)

# Persist the concatenated index tables as CSV files.
index_tables = [
    ('../data/processed/20230304/df_indices_raw.csv', df_indices_raw),
    ('../data/processed/20230304/df_indices_bg.csv', df_indices_bg),
    ('../data/processed/20230304/df_indices_fg.csv', df_indices_fg),
]
for csv_path, index_table in index_tables:
    index_table.to_csv(csv_path)