In [12]:
# !git clone https://github.com/cordutie/texstat.git
# !git clone https://github.com/gudgud96/frechet-audio-distance.git
# pip install -r frechet-audio-distance/requirements.txt
# Rename the cloned folder from frechet-audio-distance to fad_repo: hyphens are not valid in Python module names, and the imports below use fad_repo
# mv frechet-audio-distance fad_repo

# FAD Resynthesis Scores

In [13]:
import sys
import os
import shutil

# Make the cloned 'texstat/' repository folder importable BEFORE importing from it,
# so this cell does not depend on sys.path state left over from an earlier kernel
# session (the later import cell in this notebook uses the same order).
# NOTE(review): assumes the importable package lives inside the cloned folder — confirm
# against the repository layout.
parent_dir = os.path.abspath('texstat/')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from texstat.fad import FAD_wrapper
from fad_repo.frechet_audio_distance.fad import FrechetAudioDistance

# Pairs of paths to be compared
texture_types = ["bubbles", "fire", "keyboard", "rain", "river", "shards", "waterfall", "wind"]


def _collect_pairs(sounds_dir, texture_names):
    """Map every texture name to its sorted ``[original, reproduction]`` path pairs.

    Args:
        sounds_dir: Folder holding the sounds, written with a trailing slash
            because paths are built by plain string concatenation.
        texture_names: Texture names; ``<name>.wav`` is taken as the original.

    Returns:
        dict: ``{name: [[original_path, reproduction_path], ...]}`` with the pairs
        ordered by reproduction path.

    Only ``.wav`` entries are considered. The evaluation helpers create temporary
    folders named after the wav files inside the same directory; if a previous run
    was interrupted those folders can still be there and must not be mistaken for
    reproductions.
    """
    # The directory listing does not depend on the texture, so read it once.
    entries = os.listdir(sounds_dir)
    pairs = {}
    for name in texture_names:
        original = sounds_dir + name + ".wav"
        candidates = sorted(
            sounds_dir + entry
            for entry in entries
            if entry.startswith(name) and entry.lower().endswith(".wav")
        )
        pairs[name] = [[original, candidate] for candidate in candidates if candidate != original]
    return pairs


pairs_texDSP = _collect_pairs("evaluation_sounds_texDSP/", texture_types)
print(pairs_texDSP)

pairs_noisebandnet = _collect_pairs("evaluation_sounds_noisebandnet/", texture_types)
print(pairs_noisebandnet)

def create_folder_from_wav(wav_path):
    """Copy a wav file into a sibling folder named after the file and return that folder.

    The folder sits next to ``wav_path`` and carries the file name without its
    extension. It is created when missing; an existing folder is reused. The wav
    file is then copied into it (with ``shutil.copy2``) under its original name.

    Args:
        wav_path: Path of the wav file to wrap in a folder.

    Returns:
        The path of the folder that now contains the copy.
    """
    parent, file_name = os.path.split(wav_path)
    stem, _extension = os.path.splitext(file_name)
    folder_path = os.path.join(parent, stem)

    if not os.path.exists(folder_path):
        print(f"Creating folder '{folder_path}'...")
        os.makedirs(folder_path)
    else:
        print(f"Folder '{folder_path}' already exists.")

    # The copy keeps the original file name inside the new folder
    dest_wav_path = os.path.join(folder_path, file_name)
    shutil.copy2(wav_path, dest_wav_path)
    print(f"Copied '{wav_path}' to '{dest_wav_path}'")

    return folder_path

{'bubbles': [['evaluation_sounds_texDSP/bubbles.wav', 'evaluation_sounds_texDSP/bubbles_1.wav'], ['evaluation_sounds_texDSP/bubbles.wav', 'evaluation_sounds_texDSP/bubbles_2.wav'], ['evaluation_sounds_texDSP/bubbles.wav', 'evaluation_sounds_texDSP/bubbles_3.wav'], ['evaluation_sounds_texDSP/bubbles.wav', 'evaluation_sounds_texDSP/bubbles_4.wav']], 'fire': [['evaluation_sounds_texDSP/fire.wav', 'evaluation_sounds_texDSP/fire_1.wav'], ['evaluation_sounds_texDSP/fire.wav', 'evaluation_sounds_texDSP/fire_2.wav'], ['evaluation_sounds_texDSP/fire.wav', 'evaluation_sounds_texDSP/fire_3.wav'], ['evaluation_sounds_texDSP/fire.wav', 'evaluation_sounds_texDSP/fire_4.wav'], ['evaluation_sounds_texDSP/fire.wav', 'evaluation_sounds_texDSP/fire_5.wav']], 'keyboard': [['evaluation_sounds_texDSP/keyboard.wav', 'evaluation_sounds_texDSP/keyboard_1.wav']], 'rain': [['evaluation_sounds_texDSP/rain.wav', 'evaluation_sounds_texDSP/rain_1.wav'], ['evaluation_sounds_texDSP/rain.wav', 'evaluation_sounds_texDSP

### 1. FAD using TexStat evaluation over texDSP and noisebandnet

In [14]:
def fad_texstat_evaluation(wav_path_1, wav_path_2, FAD_texstat):
    """Score two wav files against each other with the TexStat-based FAD object.

    The scorer is called with folders, so each wav file is first copied into a
    temporary folder of its own via ``create_folder_from_wav``.

    Args:
        wav_path_1: Path of the first wav file.
        wav_path_2: Path of the second wav file.
        FAD_texstat: Object exposing ``score(folder_1, folder_2)``.

    Returns:
        Whatever ``FAD_texstat.score`` returns for the two folders.

    The temporary folders are removed even when folder creation or scoring raises,
    so a failed evaluation does not leave stray folders next to the sounds.
    """
    created = []
    try:
        for wav_path in (wav_path_1, wav_path_2):
            created.append(create_folder_from_wav(wav_path))
        return FAD_texstat.score(created[0], created[1])
    finally:
        # ignore_errors keeps a cleanup problem from masking the original exception
        for folder in created:
            shutil.rmtree(folder, ignore_errors=True)

# Build the TexStat-based FAD scorer once; the evaluation cells below pass this same
# instance to fad_texstat_evaluation for every pair. It is configured to run on the CPU.
# frame_size equals sampling_rate here — presumably one-second frames, assuming
# frame_size is expressed in samples (TODO confirm against FAD_wrapper).
FAD_texstat = FAD_wrapper(frame_size = 44100, sampling_rate = 44100, device='cpu')

In [15]:
# Score every texDSP reproduction against its original with the TexStat-based FAD
results = {}
for texture_type in pairs_texDSP:
    print(f"Evaluating texture type: {texture_type}")
    for original_wav, resynth_wav in pairs_texDSP[texture_type]:
        # Scores are keyed by the reproduction path
        results[resynth_wav] = fad_texstat_evaluation(original_wav, resynth_wav, FAD_texstat)

print("Results for texDSP:")
for resynth_wav, fad_value in results.items():
    print(f"{resynth_wav}: {fad_value}")

Evaluating texture type: bubbles
Creating folder 'evaluation_sounds_texDSP/bubbles'...
Copied 'evaluation_sounds_texDSP/bubbles.wav' to 'evaluation_sounds_texDSP/bubbles/bubbles.wav'
Creating folder 'evaluation_sounds_texDSP/bubbles_1'...
Copied 'evaluation_sounds_texDSP/bubbles_1.wav' to 'evaluation_sounds_texDSP/bubbles_1/bubbles_1.wav'
Processing folder: bubbles
Found 1 files in evaluation_sounds_texDSP/bubbles
    Segmented 120 segments from evaluation_sounds_texDSP/bubbles/bubbles.wav
Total segments: 120
Processed 120 segments in evaluation_sounds_texDSP/bubbles
Processing folder: bubbles_1
Found 1 files in evaluation_sounds_texDSP/bubbles_1
    Segmented 119 segments from evaluation_sounds_texDSP/bubbles_1/bubbles_1.wav
Total segments: 119
Processed 119 segments in evaluation_sounds_texDSP/bubbles_1
Creating folder 'evaluation_sounds_texDSP/bubbles'...
Copied 'evaluation_sounds_texDSP/bubbles.wav' to 'evaluation_sounds_texDSP/bubbles/bubbles.wav'
Creating folder 'evaluation_sound

In [16]:
# Evaluate the pairs for noisebandnet
results={}
for texture_type, pairs in pairs_noisebandnet.items():
    print(f"Evaluating texture type: {texture_type}")
    for pair in pairs:
        score = fad_texstat_evaluation(pair[0], pair[1], FAD_texstat)
        results[pair[1]] = score

print("Results for noisebandnet:")
for texture_type, scores in results.items():
    print(f"{texture_type}: {scores}")

Evaluating texture type: bubbles
Creating folder 'evaluation_sounds_noisebandnet/bubbles'...
Copied 'evaluation_sounds_noisebandnet/bubbles.wav' to 'evaluation_sounds_noisebandnet/bubbles/bubbles.wav'
Creating folder 'evaluation_sounds_noisebandnet/bubbles_resynth'...
Copied 'evaluation_sounds_noisebandnet/bubbles_resynth.wav' to 'evaluation_sounds_noisebandnet/bubbles_resynth/bubbles_resynth.wav'
Processing folder: bubbles
Found 1 files in evaluation_sounds_noisebandnet/bubbles
    Segmented 120 segments from evaluation_sounds_noisebandnet/bubbles/bubbles.wav
Total segments: 120
Processed 120 segments in evaluation_sounds_noisebandnet/bubbles
Processing folder: bubbles_resynth
Found 1 files in evaluation_sounds_noisebandnet/bubbles_resynth
    Segmented 119 segments from evaluation_sounds_noisebandnet/bubbles_resynth/bubbles_resynth.wav
Total segments: 119
Processed 119 segments in evaluation_sounds_noisebandnet/bubbles_resynth
Evaluating texture type: fire
Creating folder 'evaluation

### 2. FAD using VGGish

In [17]:
def fad_evaluation(wav_path_1, wav_path_2, frechet):
    """Score two wav files against each other with a Frechet Audio Distance object.

    The scorer is called with folders, so each wav file is first copied into a
    temporary folder of its own via ``create_folder_from_wav``.

    Args:
        wav_path_1: Path of the first wav file.
        wav_path_2: Path of the second wav file.
        frechet: Object exposing ``score(folder_1, folder_2, dtype=...)``.

    Returns:
        Whatever ``frechet.score`` returns for the two folders (called with
        ``dtype="float32"``).

    The temporary folders are removed even when folder creation or scoring raises,
    so a failed evaluation does not leave stray folders next to the sounds.
    """
    created = []
    try:
        for wav_path in (wav_path_1, wav_path_2):
            created.append(create_folder_from_wav(wav_path))
        return frechet.score(
            created[0],
            created[1],
            dtype="float32"
        )
    finally:
        # ignore_errors keeps a cleanup problem from masking the original exception
        for folder in created:
            shutil.rmtree(folder, ignore_errors=True)

# Build the Frechet Audio Distance scorer once, configured for the `vggish` model with
# a 16000 sample rate, PCA and activation switched off, and verbose output disabled.
# The evaluation cells below pass this same instance to fad_evaluation for every pair.
frechet = FrechetAudioDistance(
    model_name="vggish",
    sample_rate=16000,
    use_pca=False, 
    use_activation=False,
    verbose=False
)

Using cache found in /home/esteban/.cache/torch/hub/harritaylor_torchvggish_master


In [18]:
# Evaluate the pairs for texDSP (VGGish-based FAD)
results={}
for texture_type, pairs in pairs_texDSP.items():
    print(f"Evaluating texture type: {texture_type}")
    for pair in pairs:
        score = fad_evaluation(pair[0], pair[1], frechet)
        # Scores are keyed by the reproduction path
        results[pair[1]] = score

# This cell scores the texDSP pairs, so the summary header must say texDSP
# (it previously printed "noisebandnet", mislabelling the results).
print("Results for texDSP:")
for reproduction_path, score in results.items():
    print(f"{reproduction_path}: {score}")

Evaluating texture type: bubbles
Creating folder 'evaluation_sounds_texDSP/bubbles'...
Copied 'evaluation_sounds_texDSP/bubbles.wav' to 'evaluation_sounds_texDSP/bubbles/bubbles.wav'
Creating folder 'evaluation_sounds_texDSP/bubbles_1'...
Copied 'evaluation_sounds_texDSP/bubbles_1.wav' to 'evaluation_sounds_texDSP/bubbles_1/bubbles_1.wav'
Creating folder 'evaluation_sounds_texDSP/bubbles'...
Copied 'evaluation_sounds_texDSP/bubbles.wav' to 'evaluation_sounds_texDSP/bubbles/bubbles.wav'
Creating folder 'evaluation_sounds_texDSP/bubbles_2'...
Copied 'evaluation_sounds_texDSP/bubbles_2.wav' to 'evaluation_sounds_texDSP/bubbles_2/bubbles_2.wav'
Creating folder 'evaluation_sounds_texDSP/bubbles'...
Copied 'evaluation_sounds_texDSP/bubbles.wav' to 'evaluation_sounds_texDSP/bubbles/bubbles.wav'
Creating folder 'evaluation_sounds_texDSP/bubbles_3'...
Copied 'evaluation_sounds_texDSP/bubbles_3.wav' to 'evaluation_sounds_texDSP/bubbles_3/bubbles_3.wav'
Creating folder 'evaluation_sounds_texDSP/b

In [19]:
# Evaluate the pairs for noisebandnet
results={}
for texture_type, pairs in pairs_noisebandnet.items():
    print(f"Evaluating texture type: {texture_type}")
    for pair in pairs:
        score = fad_evaluation(pair[0], pair[1], frechet)
        results[pair[1]] = score

print("Results for noisebandnet:")
for texture_type, scores in results.items():
    print(f"{texture_type}: {scores}")

Evaluating texture type: bubbles
Creating folder 'evaluation_sounds_noisebandnet/bubbles'...
Copied 'evaluation_sounds_noisebandnet/bubbles.wav' to 'evaluation_sounds_noisebandnet/bubbles/bubbles.wav'
Creating folder 'evaluation_sounds_noisebandnet/bubbles_resynth'...
Copied 'evaluation_sounds_noisebandnet/bubbles_resynth.wav' to 'evaluation_sounds_noisebandnet/bubbles_resynth/bubbles_resynth.wav'
Evaluating texture type: fire
Creating folder 'evaluation_sounds_noisebandnet/fire'...
Copied 'evaluation_sounds_noisebandnet/fire.wav' to 'evaluation_sounds_noisebandnet/fire/fire.wav'
Creating folder 'evaluation_sounds_noisebandnet/fire_resynth'...
Copied 'evaluation_sounds_noisebandnet/fire_resynth.wav' to 'evaluation_sounds_noisebandnet/fire_resynth/fire_resynth.wav'
Evaluating texture type: keyboard
Creating folder 'evaluation_sounds_noisebandnet/keyboard'...
Copied 'evaluation_sounds_noisebandnet/keyboard.wav' to 'evaluation_sounds_noisebandnet/keyboard/keyboard.wav'
Creating folder 'ev

### 3. TexStat and MSS frame-by-frame score

In [20]:
import sys
import os

# Append the absolute path of the cloned 'texstat/' folder to sys.path (once) so the
# texstat imports that follow in this cell can be resolved.
parent_dir = os.path.abspath('texstat/')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
    
from texstat.segmentation import *
from texstat.functions import *
import torch
from texstat.fad import *
import texstat.torch_filterbanks.filterbanks as fb
import librosa
import torchaudio

# Multiscale Spectrogram Loss for comparison
def multiscale_fft(signal, scales=[4096, 2048, 1024, 512, 256, 128], overlap=.75):
    """Return the magnitude STFT of ``signal`` at every FFT size in ``scales``.

    Args:
        signal: Tensor handed to ``torch.stft``.
        scales: FFT sizes; each one is also used as the window length.
        overlap: Fraction of overlap between consecutive windows; the hop length
            for size ``s`` is ``int(s * (1 - overlap))``.

    Returns:
        A list with one magnitude spectrogram per entry of ``scales``, in the
        same order.
    """
    def _magnitude(n_fft):
        # Hann window moved to the dtype/device of the signal
        window = torch.hann_window(n_fft).to(signal)
        spectrum = torch.stft(
            signal,
            n_fft=n_fft,
            hop_length=int(n_fft * (1 - overlap)),
            win_length=n_fft,
            window=window,
            center=True,
            normalized=True,
            return_complex=True,
        )
        return spectrum.abs()

    return [_magnitude(n_fft) for n_fft in scales]

def safe_log(x):
    """Natural logarithm of ``x + 1e-7``; the offset keeps the result finite at zero."""
    epsilon = 1e-7
    return torch.log(x + epsilon)

def multiscale_spectrogram_loss(x, x_hat):
    """Multiscale spectrogram distance between ``x`` and ``x_hat``.

    Both signals are turned into magnitude spectrograms with ``multiscale_fft``.
    For every scale, the mean absolute difference of the magnitudes and the mean
    absolute difference of their ``safe_log`` values are added to the total.

    Args:
        x: Reference signal.
        x_hat: Signal compared against the reference.

    Returns:
        The accumulated loss over all scales.
    """
    total = 0
    for target, estimate in zip(multiscale_fft(x), multiscale_fft(x_hat)):
        linear_term = (target - estimate).abs().mean()
        log_term = (safe_log(target) - safe_log(estimate)).abs().mean()
        # Same accumulation order as before: running total, then linear, then log
        total = total + linear_term + log_term
    return total

# TexStat parameters
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sr            = 44100
# frame_size equals sr — presumably one-second frames, assuming frame_size is in samples (TODO confirm).
frame_size    = sr
# Filter counts handed to the two filterbanks built below (16 for coch_fb, 6 for mod_fb).
N_filter_bank = 16
M_filter_bank = 6
N_moments     = 4
# alpha has 4 entries (the same count as N_moments) and beta has 5; both are passed to texstat_loss.
# NOTE(review): their exact meaning is defined inside texstat_loss, which is not visible here — confirm there.
alpha         = torch.tensor([10, 1, 1/10, 1/100], device=device)
beta          = torch.tensor([1, 1, 1, 1, 1], device=device) 
# Quarter-rate sample rate and frame size: used by the resampler and by the second filterbank.
new_sr, new_frame_size = sr // 4, frame_size // 4 # for downsampler
downsampler = torchaudio.transforms.Resample(sr, new_sr).to(device)
# Filterbank arguments are (size, sample rate, number of filters, 20 or 10, sr // 2 or new_sr // 4);
# the last two are presumably lower/upper frequency limits — TODO confirm against the filterbanks module.
coch_fb     = fb.EqualRectangularBandwidth(frame_size, sr, N_filter_bank, 20, sr // 2)
mod_fb      = fb.Logarithmic(new_frame_size,       new_sr, M_filter_bank, 10, new_sr // 4)

In [21]:
# Evaluate using texStat and MSS on texDSP
for texture_type in texture_types:
    print("Texture type: ", texture_type)
    for pair in pairs_texDSP[texture_type]:
        print("Pair: ", pair)
        original_path, reproduction_path = pair
        original_segments     = segment_audio(original_path, sr, sr, torch_type=True)
        reproduction_segments = segment_audio(reproduction_path, sr, sr, torch_type=True)
        # Compare frame by frame, only over the segments both files provide
        min_segments = min(len(original_segments), len(reproduction_segments))
        if min_segments == 0:
            # torch.stack raises on an empty list, so report the pair and move on
            print("No segments to compare; skipping pair.")
            continue
        stats_loss = []
        mss_loss   = []
        # Pure evaluation: nothing is back-propagated, so skip autograd bookkeeping
        with torch.no_grad():
            for i in range(min_segments):
                og   = original_segments[i].to(device)
                fake = reproduction_segments[i].to(device)
                stats_loss_local = texstat_loss(og, fake, coch_fb, mod_fb, downsampler, N_moments, alpha, beta)
                mss_loss_local   = multiscale_spectrogram_loss(og, fake)
                # Keep the per-segment losses as tensors; they are stacked below
                stats_loss.append(stats_loss_local)
                mss_loss.append(mss_loss_local)
        # One tensor of per-segment losses for each metric
        stats_loss = torch.stack(stats_loss)
        mss_loss   = torch.stack(mss_loss)
        # Mean and standard deviation over the segments, as Python floats
        stats_loss_mean = stats_loss.mean().item()
        stats_loss_std  = stats_loss.std().item()
        mss_loss_mean   = mss_loss.mean().item()
        mss_loss_std    = mss_loss.std().item()
        # Report in LaTeX-ready "$mean \pm std$" form
        print(f"Stats loss mean ± std: ${stats_loss_mean:.1f} \\pm {stats_loss_std:.1f}$")
        print(f"MSS loss mean ± std: ${mss_loss_mean:.1f} \\pm {mss_loss_std:.1f}$")

Texture type:  bubbles
Pair:  ['evaluation_sounds_texDSP/bubbles.wav', 'evaluation_sounds_texDSP/bubbles_1.wav']
Stats loss mean ± std: $1.3 \pm 0.3$
MSS loss mean ± std: $7.9 \pm 0.7$
Pair:  ['evaluation_sounds_texDSP/bubbles.wav', 'evaluation_sounds_texDSP/bubbles_2.wav']
Stats loss mean ± std: $1.4 \pm 0.3$
MSS loss mean ± std: $7.4 \pm 0.5$
Pair:  ['evaluation_sounds_texDSP/bubbles.wav', 'evaluation_sounds_texDSP/bubbles_3.wav']
Stats loss mean ± std: $1.2 \pm 0.3$
MSS loss mean ± std: $6.6 \pm 0.3$
Pair:  ['evaluation_sounds_texDSP/bubbles.wav', 'evaluation_sounds_texDSP/bubbles_4.wav']
Stats loss mean ± std: $1.5 \pm 0.3$
MSS loss mean ± std: $8.1 \pm 0.5$
Texture type:  fire
Pair:  ['evaluation_sounds_texDSP/fire.wav', 'evaluation_sounds_texDSP/fire_1.wav']
Stats loss mean ± std: $2.9 \pm 2.0$
MSS loss mean ± std: $10.1 \pm 1.2$
Pair:  ['evaluation_sounds_texDSP/fire.wav', 'evaluation_sounds_texDSP/fire_2.wav']
Stats loss mean ± std: $2.9 \pm 2.1$
MSS loss mean ± std: $9.6 \pm 1

In [22]:
# Evaluate using texStat and MSS on noisebandnet
for texture_type in texture_types:
    print("Texture type: ", texture_type)
    for pair in pairs_noisebandnet[texture_type]:
        print("Pair: ", pair)
        original_path, reproduction_path = pair
        original_segments     = segment_audio(original_path, sr, sr, torch_type=True)
        reproduction_segments = segment_audio(reproduction_path, sr, sr, torch_type=True)
        # Compare frame by frame, only over the segments both files provide
        min_segments = min(len(original_segments), len(reproduction_segments))
        if min_segments == 0:
            # torch.stack raises on an empty list, so report the pair and move on
            print("No segments to compare; skipping pair.")
            continue
        stats_loss = []
        mss_loss   = []
        # Pure evaluation: nothing is back-propagated, so skip autograd bookkeeping
        with torch.no_grad():
            for i in range(min_segments):
                og   = original_segments[i].to(device)
                fake = reproduction_segments[i].to(device)
                stats_loss_local = texstat_loss(og, fake, coch_fb, mod_fb, downsampler, N_moments, alpha, beta)
                mss_loss_local   = multiscale_spectrogram_loss(og, fake)
                # Keep the per-segment losses as tensors; they are stacked below
                stats_loss.append(stats_loss_local)
                mss_loss.append(mss_loss_local)
        # One tensor of per-segment losses for each metric
        stats_loss = torch.stack(stats_loss)
        mss_loss   = torch.stack(mss_loss)
        # Mean and standard deviation over the segments, as Python floats
        stats_loss_mean = stats_loss.mean().item()
        stats_loss_std  = stats_loss.std().item()
        mss_loss_mean   = mss_loss.mean().item()
        mss_loss_std    = mss_loss.std().item()
        # Report in LaTeX-ready "$mean \pm std$" form
        print(f"Stats loss mean ± std: ${stats_loss_mean:.1f} \\pm {stats_loss_std:.1f}$")
        print(f"MSS loss mean ± std: ${mss_loss_mean:.1f} \\pm {mss_loss_std:.1f}$")

Texture type:  bubbles
Pair:  ['evaluation_sounds_noisebandnet/bubbles.wav', 'evaluation_sounds_noisebandnet/bubbles_resynth.wav']
Stats loss mean ± std: $0.7 \pm 0.1$
MSS loss mean ± std: $4.7 \pm 0.1$
Texture type:  fire
Pair:  ['evaluation_sounds_noisebandnet/fire.wav', 'evaluation_sounds_noisebandnet/fire_resynth.wav']
Stats loss mean ± std: $1.7 \pm 1.0$
MSS loss mean ± std: $4.5 \pm 0.2$
Texture type:  keyboard
Pair:  ['evaluation_sounds_noisebandnet/keyboard.wav', 'evaluation_sounds_noisebandnet/keyboard_resynth.wav']
Stats loss mean ± std: $20.0 \pm 7.7$
MSS loss mean ± std: $13.8 \pm 0.6$
Texture type:  rain
Pair:  ['evaluation_sounds_noisebandnet/rain.wav', 'evaluation_sounds_noisebandnet/rain_resynth.wav']
Stats loss mean ± std: $2.4 \pm 2.0$
MSS loss mean ± std: $9.1 \pm 0.4$
Texture type:  river
Pair:  ['evaluation_sounds_noisebandnet/river.wav', 'evaluation_sounds_noisebandnet/river_resynth.wav']
Stats loss mean ± std: $0.6 \pm 0.1$
MSS loss mean ± std: $6.7 \pm 0.3$
Text