In [11]:
# !git clone https://github.com/cordutie/texstat.git

# FAD Resynthesis Scores

In [2]:
import sys
import os

# Put the local `texstat/` directory on the import path (only once per kernel session)
parent_dir = os.path.abspath('texstat/')
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

# Reference / resynthesis pairs to compare, grouped by texture name
texture_types = ["bubbles", "fire", "keyboard", "rain", "river", "shards", "waterfall", "wind"]

pairs = {}
for texture_type in texture_types:
    original_path = "evaluation_sounds/" + texture_type + ".wav"
    # Paths of every file in evaluation_sounds/ whose name starts with the texture name, sorted
    candidate_paths = sorted(
        "evaluation_sounds/" + file_name
        for file_name in os.listdir("evaluation_sounds/")
        if file_name.startswith(texture_type)
    )
    # One [original_path, reproduction_path] entry per candidate, leaving out the original itself
    pairs[texture_type] = [
        [original_path, candidate_path]
        for candidate_path in candidate_paths
        if candidate_path != original_path
    ]

print(pairs)

{'bubbles': [['evaluation_sounds/bubbles.wav', 'evaluation_sounds/bubbles_1.wav'], ['evaluation_sounds/bubbles.wav', 'evaluation_sounds/bubbles_2.wav'], ['evaluation_sounds/bubbles.wav', 'evaluation_sounds/bubbles_3.wav'], ['evaluation_sounds/bubbles.wav', 'evaluation_sounds/bubbles_4.wav']], 'fire': [['evaluation_sounds/fire.wav', 'evaluation_sounds/fire_1.wav'], ['evaluation_sounds/fire.wav', 'evaluation_sounds/fire_2.wav'], ['evaluation_sounds/fire.wav', 'evaluation_sounds/fire_3.wav'], ['evaluation_sounds/fire.wav', 'evaluation_sounds/fire_4.wav'], ['evaluation_sounds/fire.wav', 'evaluation_sounds/fire_5.wav']], 'keyboard': [['evaluation_sounds/keyboard.wav', 'evaluation_sounds/keyboard_1.wav']], 'rain': [['evaluation_sounds/rain.wav', 'evaluation_sounds/rain_1.wav'], ['evaluation_sounds/rain.wav', 'evaluation_sounds/rain_2.wav']], 'river': [['evaluation_sounds/river.wav', 'evaluation_sounds/river_1.wav'], ['evaluation_sounds/river.wav', 'evaluation_sounds/river_2.wav'], ['evaluati

### 1. FAD using TexStat

In [2]:
from texstat.fad import *
import texstat.torch_filterbanks.filterbanks as fb
import librosa
import torchaudio

# TexStat properties
sr            = 44100  # sample rate every file is loaded at; also used to build the resampler and filterbanks
frame_size    = 44100  # frame length handed to the filterbank and to compute_fad_from_signals
N_filter_bank = 16     # size argument of the EqualRectangularBandwidth filterbank
M_filter_bank = 6      # size argument of the Logarithmic filterbank
new_sr, new_frame_size = sr // 4, frame_size // 4 # for downsampler
downsampler = torchaudio.transforms.Resample(sr, new_sr)
# NOTE(review): the last two arguments (20, sr // 2 and 10, new_sr // 4) look like lower/upper
# band edges in Hz -- confirm against the filterbank API.
coch_fb     = fb.EqualRectangularBandwidth(frame_size, sr, N_filter_bank, 20, sr // 2)
mod_fb      = fb.Logarithmic(new_frame_size,       new_sr, M_filter_bank, 10, new_sr // 4)

# The reference recording is shared by every pair of a texture, so it is decoded only when the
# reference path changes instead of once per reproduction.
# Assumes compute_fad_from_signals does not modify its inputs in place -- TODO confirm.
loaded_original_path = None
for texture_type in texture_types:
    print("Texture type: ", texture_type)
    for pair in pairs[texture_type]:
        original_path, reproduction_path = pair
        if original_path != loaded_original_path:
            original_signal      = librosa.load(original_path, sr=sr, mono=True)[0]
            loaded_original_path = original_path
        # Load at `sr` (not a second hard-coded 44100) so the audio always matches the filterbanks
        reproduction_signal = librosa.load(reproduction_path, sr=sr, mono=True)[0]
        # Compute the FAD
        fad = compute_fad_from_signals(original_signal, reproduction_signal, frame_size, coch_fb, mod_fb, downsampler)
        print("FAD score for pair: ", original_path, reproduction_path, "is", fad)



Texture type:  bubbles
Segmented 120 segments from signal
Processed 120 segments from signal
Segmented 119 segments from signal
Processed 119 segments from signal
FAD score for pair:  evaluation_sounds/bubbles.wav evaluation_sounds/bubbles_1.wav is 1138.8335603029718
Segmented 120 segments from signal
Processed 120 segments from signal
Segmented 118 segments from signal
Processed 118 segments from signal
FAD score for pair:  evaluation_sounds/bubbles.wav evaluation_sounds/bubbles_2.wav is 83.805898021874
Segmented 120 segments from signal
Processed 120 segments from signal
Segmented 118 segments from signal
Processed 118 segments from signal
FAD score for pair:  evaluation_sounds/bubbles.wav evaluation_sounds/bubbles_3.wav is 63.7318939554856
Segmented 120 segments from signal
Processed 120 segments from signal
Segmented 119 segments from signal
Processed 119 segments from signal
FAD score for pair:  evaluation_sounds/bubbles.wav evaluation_sounds/bubbles_4.wav is 90.91121753775059
Tex

### 2. FAD using VGGish

In [8]:
from torchvggish import vggish, vggish_input

import torch

# Initialise model and download weights
embedding_model = vggish()
embedding_model.eval()


def _vggish_embeddings(wav_path):
    """Return the VGGish embeddings of the WAV file at `wav_path` as a NumPy array."""
    examples = vggish_input.wavfile_to_examples(wav_path)
    # Inference only: no_grad avoids building an autograd graph that would be thrown away
    with torch.no_grad():
        embeddings = embedding_model(examples)
    # .cpu() is a no-op for CPU tensors and keeps .numpy() valid if the model returns a CUDA tensor
    return embeddings.detach().cpu().numpy()


# NOTE(review): the scores printed by this cell are in the 1e6 range. If `vggish()` applies its
# embedding post-processing by default, these are distances between post-processed embeddings --
# confirm that this is the intended embedding space for FAD.
embedded_original_path = None
for texture_type in texture_types:
    print("Texture type: ", texture_type)
    for pair in pairs[texture_type]:
        original_path, reproduction_path = pair
        # The reference file is shared by all pairs of a texture: embed it once, not once per pair
        if original_path != embedded_original_path:
            original_embeddings    = _vggish_embeddings(original_path)
            embedded_original_path = original_path
        reproduction_embeddings = _vggish_embeddings(reproduction_path)
        # Compute the FAD
        fad = compute_fad_from_embeddings(original_embeddings, reproduction_embeddings)
        print("FAD score for pair: ", original_path, reproduction_path, "is", fad)

Texture type:  bubbles
FAD score for pair:  evaluation_sounds/bubbles.wav evaluation_sounds/bubbles_1.wav is 1134844.7413896786
FAD score for pair:  evaluation_sounds/bubbles.wav evaluation_sounds/bubbles_2.wav is 1123190.9084297614
FAD score for pair:  evaluation_sounds/bubbles.wav evaluation_sounds/bubbles_3.wav is 1021556.0924389309
FAD score for pair:  evaluation_sounds/bubbles.wav evaluation_sounds/bubbles_4.wav is 1562180.8857099612
Texture type:  fire
FAD score for pair:  evaluation_sounds/fire.wav evaluation_sounds/fire_1.wav is 765102.6467977822
FAD score for pair:  evaluation_sounds/fire.wav evaluation_sounds/fire_2.wav is 726847.7355396337
FAD score for pair:  evaluation_sounds/fire.wav evaluation_sounds/fire_3.wav is 765337.581345021
FAD score for pair:  evaluation_sounds/fire.wav evaluation_sounds/fire_4.wav is 701855.4972806366
FAD score for pair:  evaluation_sounds/fire.wav evaluation_sounds/fire_5.wav is 772594.884657207
Texture type:  keyboard
FAD score for pair:  eval

### 3. TexStat and MSS frame-by-frame score

In [14]:
import sys
import os

# Put the local `texstat/` directory on the import path (only once per kernel session)
parent_dir = os.path.abspath('texstat/')
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)
    
from texstat.segmentation import *
from texstat.functions import *
import torch
from texstat.fad import *
import texstat.torch_filterbanks.filterbanks as fb
import librosa
import torchaudio

# Multiscale Spectrogram Loss for comparison
def multiscale_fft(signal, scales=(4096, 2048, 1024, 512, 256, 128), overlap=.75):
    """Compute STFT magnitudes of `signal` at several window sizes.

    Args:
        signal: real-valued tensor accepted by `torch.stft`.
        scales: iterable of window sizes; each is used as both `n_fft` and
            `win_length`. The default is an immutable tuple so it cannot be
            altered between calls.
        overlap: fraction of each window shared with the next one; the hop
            length is `int(s * (1 - overlap))`.

    Returns:
        A list with one magnitude spectrogram per entry of `scales`, in the
        same order.
    """
    stfts = []
    for s in scales:
        # .to(signal) gives the window the same dtype and device as the signal
        window = torch.hann_window(s).to(signal)
        S = torch.stft(
            signal,
            n_fft=s,
            hop_length=int(s * (1 - overlap)),
            win_length=s,
            window=window,
            center=True,
            normalized=True,
            return_complex=True,
        ).abs()
        stfts.append(S)
    return stfts

def safe_log(x):
    """Natural log of `x` after adding 1e-7, so an input of exactly zero stays finite."""
    eps = 1e-7
    shifted = x + eps
    return torch.log(shifted)

def multiscale_spectrogram_loss(x, x_hat):
    """Sum, over every scale of `multiscale_fft`, the mean absolute difference between the
    spectrograms of `x` and `x_hat`, both on a linear and on a `safe_log` scale.

    Returns the accumulated loss (the integer 0 if `multiscale_fft` yields no scales).
    """
    total = 0
    for spec_ref, spec_est in zip(multiscale_fft(x), multiscale_fft(x_hat)):
        linear_term = (spec_ref - spec_est).abs().mean()
        log_term = (safe_log(spec_ref) - safe_log(spec_est)).abs().mean()
        total = total + linear_term + log_term
    return total

# TexStat parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sr            = 44100
frame_size    = sr
N_filter_bank = 16
M_filter_bank = 6
N_moments     = 4
alpha         = torch.tensor([10, 1, 1/10, 1/100], device=device)
beta          = torch.tensor([1, 1, 1, 1, 1], device=device)
new_sr, new_frame_size = sr // 4, frame_size // 4 # for downsampler
downsampler = torchaudio.transforms.Resample(sr, new_sr).to(device)
coch_fb     = fb.EqualRectangularBandwidth(frame_size, sr, N_filter_bank, 20, sr // 2)
mod_fb      = fb.Logarithmic(new_frame_size,       new_sr, M_filter_bank, 10, new_sr // 4)


# The reference recording is shared by every pair of a texture, so it is segmented only when
# the reference path changes instead of once per reproduction.
# Assumes texstat_loss does not modify its inputs in place -- TODO confirm.
segmented_original_path = None
for texture_type in texture_types:
    print("Texture type: ", texture_type)
    for pair in pairs[texture_type]:
        print("Pair: ", pair)
        original_path, reproduction_path = pair
        if original_path != segmented_original_path:
            original_segments       = segment_audio(original_path, sr, sr, torch_type=True)
            segmented_original_path = original_path
        reproduction_segments = segment_audio(reproduction_path, sr, sr, torch_type=True)
        # Only the segments present in both files are compared
        min_segments = min(len(original_segments), len(reproduction_segments))
        if min_segments == 0:
            # torch.stack raises on an empty list, so report the pair and move on
            print("No segments to compare for this pair; skipping")
            continue
        stats_loss = []
        mss_loss   = []
        # Evaluation only: no gradients are needed for any of these losses
        with torch.no_grad():
            for og, fake in zip(original_segments, reproduction_segments):
                og   = og.to(device)
                fake = fake.to(device)
                stats_loss_local = texstat_loss(og, fake, coch_fb, mod_fb, downsampler, N_moments, alpha, beta)
                mss_loss_local   = multiscale_spectrogram_loss(og, fake)
                # Kept as tensors here; they are stacked and reduced after the loop
                stats_loss.append(stats_loss_local)
                mss_loss.append(mss_loss_local)
        # Stack the per-segment losses into one tensor per metric
        stats_loss = torch.stack(stats_loss)
        mss_loss   = torch.stack(mss_loss)
        # Mean and std across segments (with a single segment the std is NaN)
        stats_loss_mean = stats_loss.mean().item()
        stats_loss_std  = stats_loss.std().item()
        mss_loss_mean   = mss_loss.mean().item()
        mss_loss_std    = mss_loss.std().item()
        # Report in LaTeX-ready "$mean \pm std$" form
        print(f"Stats loss mean ± std: ${stats_loss_mean:.1f} \\pm {stats_loss_std:.1f}$")
        print(f"MSS loss mean ± std: ${mss_loss_mean:.1f} \\pm {mss_loss_std:.1f}$")

Texture type:  bubbles
Pair:  ['evaluation_sounds/bubbles.wav', 'evaluation_sounds/bubbles_1.wav']
Stats loss mean ± std: $1.3 \pm 0.3$
MSS loss mean ± std: $7.9 \pm 0.7$
Pair:  ['evaluation_sounds/bubbles.wav', 'evaluation_sounds/bubbles_2.wav']
Stats loss mean ± std: $1.4 \pm 0.3$
MSS loss mean ± std: $7.4 \pm 0.5$
Pair:  ['evaluation_sounds/bubbles.wav', 'evaluation_sounds/bubbles_3.wav']
Stats loss mean ± std: $1.2 \pm 0.3$
MSS loss mean ± std: $6.6 \pm 0.3$
Pair:  ['evaluation_sounds/bubbles.wav', 'evaluation_sounds/bubbles_4.wav']
Stats loss mean ± std: $1.5 \pm 0.3$
MSS loss mean ± std: $8.1 \pm 0.5$
Texture type:  fire
Pair:  ['evaluation_sounds/fire.wav', 'evaluation_sounds/fire_1.wav']
Stats loss mean ± std: $2.9 \pm 2.0$
MSS loss mean ± std: $10.1 \pm 1.2$
Pair:  ['evaluation_sounds/fire.wav', 'evaluation_sounds/fire_2.wav']
Stats loss mean ± std: $2.9 \pm 2.1$
MSS loss mean ± std: $9.6 \pm 1.3$
Pair:  ['evaluation_sounds/fire.wav', 'evaluation_sounds/fire_3.wav']
Stats loss