In [None]:
# !git clone https://github.com/cordutie/texstat.git

Clonando en 'texstat'...
remote: Enumerating objects: 88, done.[K
remote: Counting objects: 100% (88/88), done.[K
remote: Compressing objects: 100% (61/61), done.[K
remote: Total 88 (delta 36), reused 72 (delta 24), pack-reused 0 (from 0)[K
Recibiendo objetos: 100% (88/88), 1.05 MiB | 4.82 MiB/s, listo.
Resolviendo deltas: 100% (36/36), listo.


# FAD Resynthesis Scores

In [None]:
import sys
import os
 
# Make the cloned texstat checkout importable (no-op when already registered).
parent_dir = os.path.abspath('texstat/')
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

# Texture classes under evaluation; each one has <name>.wav and
# <name>_resynth.wav inside evaluation_sounds_2/.
texture_types = ["bubbles", "fire", "keyboard", "rain", "river", "shards", "waterfall", "wind"]

# Map every texture to a list of [original_path, resynth_path] pairs.
# Exactly one pair is registered per texture here.
pairs = {}
for texture_type in texture_types:
    original_path = f"evaluation_sounds_2/{texture_type}.wav"
    resynth_path = f"evaluation_sounds_2/{texture_type}_resynth.wav"
    pairs[texture_type] = [[original_path, resynth_path]]

print(pairs)

{'bubbles': [['evaluation_sounds_2/bubbles.wav', 'evaluation_sounds_2/bubbles_resynth.wav']], 'fire': [['evaluation_sounds_2/fire.wav', 'evaluation_sounds_2/fire_resynth.wav']], 'keyboard': [['evaluation_sounds_2/keyboard.wav', 'evaluation_sounds_2/keyboard_resynth.wav']], 'rain': [['evaluation_sounds_2/rain.wav', 'evaluation_sounds_2/rain_resynth.wav']], 'river': [['evaluation_sounds_2/river.wav', 'evaluation_sounds_2/river_resynth.wav']], 'shards': [['evaluation_sounds_2/shards.wav', 'evaluation_sounds_2/shards_resynth.wav']], 'waterfall': [['evaluation_sounds_2/waterfall.wav', 'evaluation_sounds_2/waterfall_resynth.wav']], 'wind': [['evaluation_sounds_2/wind.wav', 'evaluation_sounds_2/wind_resynth.wav']]}


### 1. FAD using TexStat

In [3]:
from texstat.fad import *
import texstat.torch_filterbanks.filterbanks as fb
import librosa
import torchaudio

# TexStat analysis settings. `sr` is the single source of truth for the sample
# rate: the resampler, both filter banks and the audio loader below all use it.
sr            = 44100
frame_size    = sr   # same value as before (44100 samples)
N_filter_bank = 16
M_filter_bank = 6
new_sr, new_frame_size = sr // 4, frame_size // 4 # for downsampler
downsampler = torchaudio.transforms.Resample(sr, new_sr)
# The trailing numeric arguments are presumably the lower/upper frequency
# limits of each bank -- TODO confirm against texstat.torch_filterbanks.
coch_fb     = fb.EqualRectangularBandwidth(frame_size, sr, N_filter_bank, 20, sr // 2)
mod_fb      = fb.Logarithmic(new_frame_size,       new_sr, M_filter_bank, 10, new_sr // 4)

# NOTE(review): the saved output of this cell shows negative scores for
# "river" and "waterfall". A Frechet distance cannot be negative, so those
# values likely come from numerical error inside compute_fad_from_signals
# (not visible here) -- verify before reporting them.
for texture_type in texture_types:
    print("Texture type: ", texture_type)
    for pair in pairs[texture_type]:
        original_path, reproduction_path = pair
        # Load both files as mono at `sr` so they match the filter banks above
        # (previously a separately hard-coded 44100).
        original_signal     = librosa.load(original_path, sr=sr, mono=True)[0]
        reproduction_signal = librosa.load(reproduction_path, sr=sr, mono=True)[0]
        # Compute the FAD
        fad = compute_fad_from_signals(original_signal, reproduction_signal, frame_size, coch_fb, mod_fb, downsampler)
        print("FAD score for pair: ", original_path, reproduction_path, "is", fad)



Texture type:  bubbles
FAD score for pair:  evaluation_sounds_2/bubbles.wav evaluation_sounds_2/bubbles_resynth.wav is 29.879322716839518
Texture type:  fire
FAD score for pair:  evaluation_sounds_2/fire.wav evaluation_sounds_2/fire_resynth.wav is 819.7767264320058
Texture type:  keyboard
FAD score for pair:  evaluation_sounds_2/keyboard.wav evaluation_sounds_2/keyboard_resynth.wav is 29385.43535900288
Texture type:  rain
FAD score for pair:  evaluation_sounds_2/rain.wav evaluation_sounds_2/rain_resynth.wav is 465.92478543620086
Texture type:  river
FAD score for pair:  evaluation_sounds_2/river.wav evaluation_sounds_2/river_resynth.wav is -13.280245193785348
Texture type:  shards
FAD score for pair:  evaluation_sounds_2/shards.wav evaluation_sounds_2/shards_resynth.wav is 86.80855199843276
Texture type:  waterfall
FAD score for pair:  evaluation_sounds_2/waterfall.wav evaluation_sounds_2/waterfall_resynth.wav is -18.609646299232583
Texture type:  wind
FAD score for pair:  evaluation_s

### 2. FAD using VGGish

In [None]:
# !git clone https://github.com/gudgud96/frechet-audio-distance.git
# pip install -r frechet-audio-distance/requirements.txt
## rename the cloned folder to fad_repo: hyphens are not valid in a Python package name, and the import below expects `fad_repo`
# mv frechet-audio-distance fad_repo


Clonando en 'frechet-audio-distance'...
remote: Enumerating objects: 165, done.[K
remote: Counting objects: 100% (62/62), done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 165 (delta 46), reused 30 (delta 29), pack-reused 103 (from 1)[K
Recibiendo objetos: 100% (165/165), 1.95 MiB | 6.73 MiB/s, listo.
Resolviendo deltas: 100% (72/72), listo.


In [3]:
# from torchvggish import vggish, vggish_input

# # Initialise model and download weights
# embedding_model = vggish()
# embedding_model.eval()

# for texture_type in texture_types:
#     print("Texture type: ", texture_type)
#     for pair in pairs[texture_type]:
#         original_path       = pair[0]
#         original_preprocessed = vggish_input.wavfile_to_examples(original_path)
#         original_embeddings   = embedding_model.forward(original_preprocessed).detach().numpy()
#         reproduction_path   = pair[1]
#         reproduction_preprocessed = vggish_input.wavfile_to_examples(reproduction_path)
#         reproduction_embeddings = embedding_model.forward(reproduction_preprocessed).detach().numpy()
#         # Compute the FAD
#         fad = compute_fad_from_embeddings(original_embeddings, reproduction_embeddings)
#         print("FAD score for pair: ", original_path, reproduction_path, "is", fad)

from fad_repo.frechet_audio_distance.fad import FrechetAudioDistance
import os
import shutil

def create_folder_from_wav(wav_path):
    """Copy a wav file into a sibling folder named after it and return that folder.

    For ``some/dir/clip.wav`` the folder is ``some/dir/clip`` and the copy is
    ``some/dir/clip/clip.wav``. An existing folder is reused as-is, including
    whatever it already contains.

    Args:
        wav_path: Path to an existing wav file.

    Returns:
        Path of the folder that now holds the copy.

    Raises:
        FileNotFoundError: If ``wav_path`` is not an existing file. This is
            checked before anything is created, so a bad path no longer
            leaves an empty folder behind.
    """
    if not os.path.isfile(wav_path):
        raise FileNotFoundError(f"No such wav file: '{wav_path}'")

    # Split once into containing directory and file name.
    base_dir, file_name = os.path.split(wav_path)
    wav_name = os.path.splitext(file_name)[0]
    folder_path = os.path.join(base_dir, wav_name)

    if os.path.exists(folder_path):
        # NOTE(review): a caller that later deletes the returned folder will
        # also delete anything that was already in it.
        print(f"Folder '{folder_path}' already exists.")
    else:
        print(f"Creating folder '{folder_path}'...")
        os.makedirs(folder_path)

    # copy2 also carries over file metadata.
    dest_wav_path = os.path.join(folder_path, file_name)
    shutil.copy2(wav_path, dest_wav_path)
    print(f"Copied '{wav_path}' to '{dest_wav_path}'")

    return folder_path

# to use `vggish`
# One scorer instance is built here and passed to every fad_evaluation call
# below, so the embedding model is set up a single time.
# NOTE(review): sample_rate=16000 differs from the 44100 Hz used in the TexStat
# cells; presumably the library resamples inputs on load -- TODO confirm.
frechet = FrechetAudioDistance(
    model_name="vggish",
    sample_rate=16000,
    use_pca=False, 
    use_activation=False,
    verbose=False
)

def fad_evaluation(wav_path_1, wav_path_2, frechet):
    """Compute the FAD score between two single wav files.

    ``frechet.score`` is called with directory paths, so each file is first
    staged into its own folder via ``create_folder_from_wav``. The staged
    folders are removed again afterwards, even when staging or scoring raises,
    so a failed run no longer leaves copies behind.

    Args:
        wav_path_1: Path to the first wav file.
        wav_path_2: Path to the second wav file.
        frechet: Scorer object exposing ``score(dir_a, dir_b, dtype=...)``.

    Returns:
        Whatever ``frechet.score`` returns for the two staged folders.
    """
    staged_folders = []
    try:
        # Stage one folder per input, remembering each one for cleanup.
        for wav_path in (wav_path_1, wav_path_2):
            staged_folders.append(create_folder_from_wav(wav_path))

        # Compute the FAD score
        fad_score = frechet.score(
            staged_folders[0],
            staged_folders[1],
            dtype="float32"
        )
    finally:
        # delete the folders
        # NOTE(review): if a folder existed before staging, this removes its
        # previous contents too.
        for folder in staged_folders:
            shutil.rmtree(folder)

    return fad_score

# Score every [original, resynthesis] pair with the VGGish-based FAD scorer.
for texture_type in texture_types:
    print("Texture type: ", texture_type)
    for original_path, reproduction_path in pairs[texture_type]:
        # Compute the FAD
        fad = fad_evaluation(original_path, reproduction_path, frechet)
        print("FAD score for pair: ", original_path, reproduction_path, "is", fad)

Using cache found in /home/esteban/.cache/torch/hub/harritaylor_torchvggish_master


Texture type:  bubbles
Creating folder 'evaluation_sounds_2/bubbles'...
Copied 'evaluation_sounds_2/bubbles.wav' to 'evaluation_sounds_2/bubbles/bubbles.wav'
Creating folder 'evaluation_sounds_2/bubbles_resynth'...
Copied 'evaluation_sounds_2/bubbles_resynth.wav' to 'evaluation_sounds_2/bubbles_resynth/bubbles_resynth.wav'
FAD score for pair:  evaluation_sounds_2/bubbles.wav evaluation_sounds_2/bubbles_resynth.wav is 21.373807079526642
Texture type:  fire
Creating folder 'evaluation_sounds_2/fire'...
Copied 'evaluation_sounds_2/fire.wav' to 'evaluation_sounds_2/fire/fire.wav'
Creating folder 'evaluation_sounds_2/fire_resynth'...
Copied 'evaluation_sounds_2/fire_resynth.wav' to 'evaluation_sounds_2/fire_resynth/fire_resynth.wav'
FAD score for pair:  evaluation_sounds_2/fire.wav evaluation_sounds_2/fire_resynth.wav is 2.525980359319645
Texture type:  keyboard
Creating folder 'evaluation_sounds_2/keyboard'...
Copied 'evaluation_sounds_2/keyboard.wav' to 'evaluation_sounds_2/keyboard/keybo

### 3. TexStat and MSS frame-by-frame score

In [2]:
import sys
import os

# Register the cloned texstat checkout on sys.path (once) so the
# `texstat.*` imports that follow can be resolved.
parent_dir = os.path.abspath('texstat/')
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)
    
from texstat.segmentation import *
from texstat.functions import *
import torch
from texstat.fad import *
import texstat.torch_filterbanks.filterbanks as fb
import librosa
import torchaudio

# Multiscale Spectrogram Loss for comparison
def multiscale_fft(signal, scales=(4096, 2048, 1024, 512, 256, 128), overlap=.75):
    """Return magnitude STFTs of `signal`, one per FFT size in `scales`.

    Args:
        signal: Tensor accepted by ``torch.stft``.
        scales: Iterable of FFT sizes. Each size is also used as the window
            length. The default is an immutable tuple, so it cannot be
            modified between calls.
        overlap: Fraction of each window shared with the next one; the hop
            length is ``int(size * (1 - overlap))``.

    Returns:
        List of magnitude spectrograms, in the same order as `scales`.
    """
    spectrograms = []
    for n_fft in scales:
        spectrum = torch.stft(
            signal,
            n_fft=n_fft,
            hop_length=int(n_fft * (1 - overlap)),
            win_length=n_fft,
            # .to(signal) matches the window's dtype and device to the input.
            window=torch.hann_window(n_fft).to(signal),
            center=True,
            normalized=True,
            return_complex=True,
        )
        spectrograms.append(spectrum.abs())
    return spectrograms

def safe_log(x):
    """Natural log of `x` after adding 1e-7, so an input of zero stays finite."""
    shifted = x + 1e-7
    return torch.log(shifted)

def multiscale_spectrogram_loss(x, x_hat):
    """Sum, over all STFT scales, of the linear and log magnitude L1 distances.

    For each scale produced by ``multiscale_fft`` the mean absolute difference
    of the magnitudes and of their ``safe_log`` values are added to the total.

    Args:
        x: Reference signal.
        x_hat: Signal compared against the reference.

    Returns:
        The accumulated loss (starts from the integer 0).
    """
    total = 0
    for ref_mag, est_mag in zip(multiscale_fft(x), multiscale_fft(x_hat)):
        linear_term = torch.mean(torch.abs(ref_mag - est_mag))
        log_term = torch.mean(torch.abs(safe_log(ref_mag) - safe_log(est_mag)))
        # Same left-to-right accumulation order as before.
        total = total + linear_term + log_term
    return total

# TexStat parameters
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sr            = 44100
frame_size    = sr
N_filter_bank = 16
M_filter_bank = 6
N_moments     = 4
# Four entries, matching N_moments; presumably per-moment weights consumed by
# texstat_loss -- TODO confirm.
alpha         = torch.tensor([10, 1, 1/10, 1/100], device=device)
# Five equal entries; presumably weights for the statistic groups inside
# texstat_loss -- TODO confirm.
# NOTE(review): built from Python ints, so this is an integer tensor, whereas
# alpha is floating point.
beta          = torch.tensor([1, 1, 1, 1, 1], device=device) 
new_sr, new_frame_size = sr // 4, frame_size // 4 # for downsampler
downsampler = torchaudio.transforms.Resample(sr, new_sr).to(device)
# NOTE(review): unlike the downsampler, the two filter banks are not moved to
# `device` here -- confirm that texstat handles placement when running on CUDA.
coch_fb     = fb.EqualRectangularBandwidth(frame_size, sr, N_filter_bank, 20, sr // 2)
mod_fb      = fb.Logarithmic(new_frame_size,       new_sr, M_filter_bank, 10, new_sr // 4)


# For every pair, compare the original and the resynthesis segment by segment
# with both losses and report mean and standard deviation over the segments.
for texture_type in texture_types:
    print("Texture type: ", texture_type)
    for pair in pairs[texture_type]:
        print("Pair: ", pair)
        original_path, reproduction_path = pair
        original_segments     = segment_audio(original_path, sr, sr, torch_type=True)
        reproduction_segments = segment_audio(reproduction_path, sr, sr, torch_type=True)
        # Only segments available in both files can be compared.
        min_segments = min(len(original_segments), len(reproduction_segments))
        if min_segments == 0:
            # torch.stack on an empty list raises; skip the pair instead.
            print("No segments to compare for this pair; skipping.")
            continue
        stats_loss = []
        mss_loss   = []
        # Pure evaluation: no gradients are needed.
        with torch.no_grad():
            for i in range(min_segments):
                og   = original_segments[i].to(device)
                fake = reproduction_segments[i].to(device)
                stats_loss.append(texstat_loss(og, fake, coch_fb, mod_fb, downsampler, N_moments, alpha, beta))
                mss_loss.append(multiscale_spectrogram_loss(og, fake))
        # Stack the per-segment losses into one tensor each.
        stats_loss = torch.stack(stats_loss)
        mss_loss   = torch.stack(mss_loss)
        # Report mean and std
        # NOTE(review): with a single segment, .std() yields nan.
        stats_loss_mean = stats_loss.mean().item()
        stats_loss_std  = stats_loss.std().item()
        mss_loss_mean   = mss_loss.mean().item()
        mss_loss_std    = mss_loss.std().item()
        # Report (formatted as LaTeX math)
        print(f"Stats loss mean ± std: ${stats_loss_mean:.1f} \\pm {stats_loss_std:.1f}$")
        print(f"MSS loss mean ± std: ${mss_loss_mean:.1f} \\pm {mss_loss_std:.1f}$")

Texture type:  bubbles
Pair:  ['evaluation_sounds_2/bubbles.wav', 'evaluation_sounds_2/bubbles_resynth.wav']
Stats loss mean ± std: $0.7 \pm 0.1$
MSS loss mean ± std: $4.7 \pm 0.1$
Texture type:  fire
Pair:  ['evaluation_sounds_2/fire.wav', 'evaluation_sounds_2/fire_resynth.wav']
Stats loss mean ± std: $1.7 \pm 1.0$
MSS loss mean ± std: $4.5 \pm 0.2$
Texture type:  keyboard
Pair:  ['evaluation_sounds_2/keyboard.wav', 'evaluation_sounds_2/keyboard_resynth.wav']
Stats loss mean ± std: $20.0 \pm 7.7$
MSS loss mean ± std: $13.8 \pm 0.6$
Texture type:  rain
Pair:  ['evaluation_sounds_2/rain.wav', 'evaluation_sounds_2/rain_resynth.wav']
Stats loss mean ± std: $2.4 \pm 2.0$
MSS loss mean ± std: $9.1 \pm 0.4$
Texture type:  river
Pair:  ['evaluation_sounds_2/river.wav', 'evaluation_sounds_2/river_resynth.wav']
Stats loss mean ± std: $0.6 \pm 0.1$
MSS loss mean ± std: $6.7 \pm 0.3$
Texture type:  shards
Pair:  ['evaluation_sounds_2/shards.wav', 'evaluation_sounds_2/shards_resynth.wav']
Stats lo