In [1]:
from torchvggish import vggish, vggish_input

def vggish_model(segment_np, embedding_model):
    # input must be numpy array with sr=16000Hz, dtype=int16 and range in [-1.0, +1.0]
    segment_preprocessed = vggish_input.waveform_to_examples(segment_np, 16000, True)
    embeddings = embedding_model.forward(segment_preprocessed)
    embeddings = embeddings.detach().numpy()
    if len(embeddings.shape) > 1:
        embeddings = np.mean(embeddings, axis=0)
    return embeddings

In [2]:
# Add the parent directory to the Python path
import sys
import os
parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Import loss function
from texstat.functions import *
from texstat.fad import *
import texstat.torch_filterbanks.filterbanks as fb

# Other imports
import torch
import torchaudio
import librosa
import soundfile as sf
import resampy
import numpy as np
import pickle
import itertools
from scipy.linalg import sqrtm

# Full dataset embedding computation
def process_and_save_embeddings(root_folder, feature_extractor, output_folder, segment_size, sample_rate, *model_args, **model_kwargs):
    """
    Processes all subfolders in the root folder, extracting embeddings for each and saving them as pickle files.
    
    Parameters:
        root_folder (str): Path to the root folder containing subfolders with audio files.
        feature_extractor (callable): Function to extract embeddings from audio files.
        output_folder (str): Path where the pickle files should be saved.
        *model_args: Additional positional arguments for the feature extractor.
        **model_kwargs: Additional keyword arguments for the feature extractor.
    """
    print(f"Processing root folder: {root_folder}")
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    for subfolder in os.listdir(root_folder):
        subfolder_path = os.path.join(root_folder, subfolder)
        if os.path.isdir(subfolder_path):  # Ensure it's a directory
            embeddings = extract_embeddings_from_folder(subfolder_path, feature_extractor, segment_size, sample_rate, *model_args, **model_kwargs)
            output_file = os.path.join(output_folder, "features_"+f"{subfolder}.pkl")
            with open(output_file, "wb") as f:
                pickle.dump(embeddings, f)
            print(f"Saved embeddings for {subfolder} to {output_file}")

def compute_fad(embeddings_real, embeddings_fake):
    mu_real, sigma_real = np.mean(embeddings_real, axis=0), np.cov(embeddings_real, rowvar=False)
    mu_fake, sigma_fake = np.mean(embeddings_fake, axis=0), np.cov(embeddings_fake, rowvar=False)
    diff = mu_real - mu_fake
    sigma_mean = (sigma_real + sigma_fake) / 2
    fad = np.trace(sigma_real + sigma_fake - 2 * sqrtm(sigma_mean)) + np.dot(diff, diff)
    return np.real(fad)

def load_embeddings(pkl_path):
    """Load embeddings from a .pkl file."""
    with open(pkl_path, "rb") as f:
        return pickle.load(f)  # Assuming stored as a NumPy array

def compute_fad_for_all_pairs(folder_path):
    """Computes FAD for all pairs of .pkl files in a folder and prints results in a table format."""
    # Get all .pkl files in the folder
    pkl_files = [f for f in os.listdir(folder_path) if f.endswith(".pkl")]

    # Store results in a list
    results = []

    # Compute FAD for all unique pairs
    for pkl1, pkl2 in itertools.combinations(pkl_files, 2):
        path1 = os.path.join(folder_path, pkl1)
        path2 = os.path.join(folder_path, pkl2)

        embeddings_1 = load_embeddings(path1)
        embeddings_2 = load_embeddings(path2)

        fad_score = compute_fad(embeddings_1, embeddings_2)
        results.append((pkl1, pkl2, fad_score))

    # Normalize FAD scores
    fad_scores = [score for _, _, score in results]
    max_fad = max(fad_scores)
    fad_scores = [score / max_fad for score in fad_scores]
    results = [(file1, file2, score) for (file1, file2, score), score in zip(results, fad_scores)]

    # Print results as a formatted table
    print("\nComputed FAD Scores:")
    print("=" * 50)
    print(f"{'File 1':<20} {'File 2':<20} {'FAD Score'}")
    print("=" * 50)
    
    for file1, file2, score in results:
        print(f"{file1:<20} {file2:<20} {score:.5f}")
    
    return results  # Return results as a list of tuples

In [3]:
# Texstat model parameters
sr            = 44100
frame_size    = 2**16
N_filter_bank = 16
M_filter_bank = 6
N_moments     = 4
alpha         = torch.tensor([100, 1, 1/10, 1/100])
beta          = torch.tensor([1, 1, 1, 1, 1])
new_sr, new_frame_size = sr // 4, frame_size // 4 # for downsampler
downsampler = torchaudio.transforms.Resample(sr, new_sr)
coch_fb     = fb.EqualRectangularBandwidth(frame_size, sr, N_filter_bank, 20, sr // 2)
mod_fb      = fb.Logarithmic(new_frame_size,       new_sr, M_filter_bank, 10, new_sr // 4)

# Texstat embeddings computing
dataset_path = "../../datasets/hand_curated/"

# Save embeddings of the dataset
process_and_save_embeddings(root_folder=dataset_path, 
                            feature_extractor=stats_model, 
                            output_folder="hand_curated_texstat_embeddings/", 
                            segment_size=frame_size,
                            sample_rate=sr,
                            coch_fb=coch_fb,
                            mod_fb=mod_fb,
                            downsampler=downsampler,
                            N_moments=N_moments,
                            alpha=alpha)

Processing root folder: ../../datasets/hand_curated/
Processing folder: keyboard
    Segmentating file: keyboard.wav
        Computing feature vector for 123 segments.
Processed 123 files in ../../datasets/hand_curated/keyboard
Saved embeddings for keyboard to hand_curated_texstat_embeddings/features_keyboard.pkl
Processing folder: water
    Segmentating file: water.wav
        Computing feature vector for 32 segments.
Processed 32 files in ../../datasets/hand_curated/water
Saved embeddings for water to hand_curated_texstat_embeddings/features_water.pkl
Processing folder: wind
    Segmentating file: wind.wav
        Computing feature vector for 30 segments.
Processed 30 files in ../../datasets/hand_curated/wind
Saved embeddings for wind to hand_curated_texstat_embeddings/features_wind.pkl
Processing folder: fire
    Segmentating file: fire.wav
        Computing feature vector for 54 segments.
Processed 54 files in ../../datasets/hand_curated/fire
Saved embeddings for fire to hand_curat

In [4]:
embeddings_folder = "hand_curated_texstat_embeddings/"
fad_results = compute_fad_for_all_pairs(embeddings_folder)



Computed FAD Scores:
File 1               File 2               FAD Score
features_water.pkl   features_bubbles.pkl 0.26814
features_water.pkl   features_wind.pkl    0.07831
features_water.pkl   features_keyboard.pkl 0.70362
features_water.pkl   features_rain.pkl    0.06422
features_water.pkl   features_fire.pkl    1.00000
features_bubbles.pkl features_wind.pkl    0.20568
features_bubbles.pkl features_keyboard.pkl 0.24450
features_bubbles.pkl features_rain.pkl    0.14598
features_bubbles.pkl features_fire.pkl    0.67877
features_wind.pkl    features_keyboard.pkl 0.60322
features_wind.pkl    features_rain.pkl    0.01288
features_wind.pkl    features_fire.pkl    0.89114
features_keyboard.pkl features_rain.pkl    0.54225
features_keyboard.pkl features_fire.pkl    0.49135
features_rain.pkl    features_fire.pkl    0.86702


In [5]:
# VGGish model parameters
sr            = 16000
frame_size    = int(16000 * 2**16/44100)
embedding_model = vggish()
embedding_model.eval()

# Dataset path
dataset_path = "../../datasets/hand_curated/"

# Save embeddings of the dataset
process_and_save_embeddings(root_folder=dataset_path, 
                            feature_extractor=vggish_model, 
                            output_folder="hand_curated_vggish_embeddings/", 
                            segment_size=frame_size,
                            sample_rate=sr,
                            embedding_model=embedding_model)

Processing root folder: ../../datasets/hand_curated/
Processing folder: keyboard
    Segmentating file: keyboard.wav
        Computing feature vector for 123 segments.
Processed 123 files in ../../datasets/hand_curated/keyboard
Saved embeddings for keyboard to hand_curated_vggish_embeddings/features_keyboard.pkl
Processing folder: water
    Segmentating file: water.wav
        Computing feature vector for 32 segments.
Processed 32 files in ../../datasets/hand_curated/water
Saved embeddings for water to hand_curated_vggish_embeddings/features_water.pkl
Processing folder: wind
    Segmentating file: wind.wav
        Computing feature vector for 30 segments.
Processed 30 files in ../../datasets/hand_curated/wind
Saved embeddings for wind to hand_curated_vggish_embeddings/features_wind.pkl
Processing folder: fire
    Segmentating file: fire.wav
        Computing feature vector for 54 segments.
Processed 54 files in ../../datasets/hand_curated/fire
Saved embeddings for fire to hand_curated_

In [6]:
embeddings_folder = "hand_curated_vggish_embeddings/"
fad_results = compute_fad_for_all_pairs(embeddings_folder)


Computed FAD Scores:
File 1               File 2               FAD Score
features_water.pkl   features_bubbles.pkl 0.76738
features_water.pkl   features_wind.pkl    0.68029
features_water.pkl   features_keyboard.pkl 0.91105
features_water.pkl   features_rain.pkl    0.71725
features_water.pkl   features_fire.pkl    0.84559
features_bubbles.pkl features_wind.pkl    1.00000
features_bubbles.pkl features_keyboard.pkl 0.98165
features_bubbles.pkl features_rain.pkl    0.85594
features_bubbles.pkl features_fire.pkl    0.94887
features_wind.pkl    features_keyboard.pkl 0.61056
features_wind.pkl    features_rain.pkl    0.62012
features_wind.pkl    features_fire.pkl    0.46734
features_keyboard.pkl features_rain.pkl    0.56047
features_keyboard.pkl features_fire.pkl    0.58448
features_rain.pkl    features_fire.pkl    0.53747
