## **DDSP Models: Evaluation**

This notebook contain the experiments to evaluate the DDSP models trained in this thesis.

In [1]:
#Git clone the ddsp_textures repo in the parent directory if you haven't already
import os
import subprocess

# Define the path where you want to clone the repository
repo_url = "https://github.com/cordutie/ddsp_textures.git"
clone_path = os.path.abspath(os.path.join('..', '..', 'ddsp_textures'))

# Check if the directory already exists
if not os.path.exists(clone_path):
    print(f"Cloning the repository to {clone_path}...")
    subprocess.run(["git", "clone", repo_url, clone_path])
    print("Repository cloned successfully.")
else:
    print(f"The repository already exists at {clone_path}.")

The repository already exists at /home/esteban/Desktop/ddsp_textures_thesis/ddsp_textures.


In [2]:
#Add the ddsp's parent directory to the Python path so that we can import the necessary modules
import sys

ddsp_dir = os.path.abspath(os.path.join(os.getcwd(), '../..'))
if ddsp_dir not in sys.path:
    sys.path.append(ddsp_dir)

In [3]:
from ddsp_textures.loss.functions import statistics_loss, multiscale_spectrogram_loss
import torch

def evaluation_multiscale_loss(signal_1, signal_2, frame_size):
    #size of signal tensor
    size = signal_1.shape[0]
    #segmentation
    segments_signal_1 = []
    segments_signal_2 = []
    number_of_frames = size // frame_size
    for i in range(0, number_of_frames):
        segments_signal_1.append(signal_1[i*frame_size:(i+1)*frame_size])
        segments_signal_2.append(signal_2[i*frame_size:(i+1)*frame_size])
    losses = []
    for i in range(len(segments_signal_1)):
        local_loss = multiscale_spectrogram_loss(segments_signal_1[i], segments_signal_2[i])
        losses.append(local_loss)
        # print(local_loss)
    losses_torch = torch.tensor(losses)
    losses_mean  = torch.mean(losses_torch) 
    losses_std   = torch.std(losses_torch) 
    return losses_mean, losses_std

def evaluation_textstat_loss(signal_1, signal_2, frame_size, N_F, erb_bank, log_bank):
    #size of signal tensor
    size = signal_1.shape[0]
    #segmentation
    segments_signal_1 = []
    segments_signal_2 = []
    number_of_frames = size // frame_size
    for i in range(0, number_of_frames):
        segments_signal_1.append(signal_1[i*frame_size:(i+1)*frame_size])
        segments_signal_2.append(signal_2[i*frame_size:(i+1)*frame_size])
        
    losses = []
    for i in range(len(segments_signal_1)):
        local_loss = statistics_loss(segments_signal_1[i], segments_signal_2[i], N_F, 44100, erb_bank, log_bank)
        losses.append(local_loss)
    losses_torch = torch.tensor(losses)
    losses_mean  = torch.mean(losses_torch)
    losses_std   = torch.std(losses_torch)
    return losses_mean, losses_std

### **1. Evaluation**

In [4]:
# import modules
import ddsp_textures.tests.tester as tester
import librosa 
import ddsp_textures.auxiliar.filterbanks as fb

# paths of models
models_path = "../data/models/DDSP"

#list of paths of folders inside of the models folder
model_folders = []
for entry in os.listdir(models_path):
    entry_path = os.path.join(models_path, entry)
    if os.path.isdir(entry_path):
        model_folders.append(entry_path)

frame_size_evaluation = 2**15
N_filterbank_evaluation = 16
erb_bank                            = fb.EqualRectangularBandwidth(frame_size_evaluation, 44100, N_filterbank_evaluation, 20, 44100 // 2)
new_frame_size, new_sampling_rate   = frame_size_evaluation // 4, 44100 // 4
log_bank                            = fb.Logarithmic(new_frame_size, new_sampling_rate, 6, 10, new_sampling_rate // 4)

for model_folder in model_folders:
    model, parameters_dict, loss_dict = tester.model_loader(model_folder, print_parameters=False)
    og_audio_path          = "../data/sounds/water_augmented.wav"
    og_audio = librosa.load(og_audio_path, sr=44100)[0]
    frame_size         = parameters_dict["frame_size"]
    features_annotator = parameters_dict["features_annotator"]
    sampling_rate      = parameters_dict["sampling_rate"] 
    
    content_og = tester.audio_preprocess(og_audio_path, frame_size, sampling_rate, features_annotator)
    og_resynthesis = tester.model_synthesizer(content_og, model, parameters_dict, random_shift=True)
    
    og_audio_evaluation_set       = og_audio[60*44100:90*44100]
    og_resynthesis_evaluation_set = og_resynthesis[60*44100:90*44100]
    
    og_audio_evaluation_set_torch       = torch.tensor(og_audio_evaluation_set)
    og_resynthesis_evaluation_set_torch = torch.tensor(og_resynthesis_evaluation_set)
    
    frame_size_evaluation = 2**15
    N_filterbank_evaluation = 16
    multiscale_loss_mean, multiscale_loss_std = evaluation_multiscale_loss(og_audio_evaluation_set_torch, og_resynthesis_evaluation_set_torch, frame_size_evaluation)
    textstat_loss_mean, textstat_loss_std     =   evaluation_textstat_loss(og_audio_evaluation_set_torch, og_resynthesis_evaluation_set_torch, frame_size_evaluation, N_filterbank_evaluation, erb_bank, log_bank)
    
    print(f"Model: {model_folder}")
    print(f"Multiscale loss mean: {multiscale_loss_mean}")
    print(f"Multiscale loss std: {multiscale_loss_std}")
    print(f"Textstat loss mean: {textstat_loss_mean}")
    print(f"Textstat loss std: {textstat_loss_std}")


  return torch._C._cuda_getDeviceCount() > 0


Model: ../data/models/DDSP/long_deep_noreg
Multiscale loss mean: 10.975004196166992
Multiscale loss std: 1.0862135887145996
Textstat loss mean: 0.7756296992301941
Textstat loss std: 0.04600538685917854
Model: ../data/models/DDSP/small_deep_reg
Multiscale loss mean: 6.71328592300415
Multiscale loss std: 0.8336761593818665
Textstat loss mean: 0.7748166918754578
Textstat loss std: 0.05992309749126434
Model: ../data/models/DDSP/small_shallow_noreg
Multiscale loss mean: 11.136171340942383
Multiscale loss std: 1.0861647129058838
Textstat loss mean: 0.7837122678756714
Textstat loss std: 0.0681658387184143
Model: ../data/models/DDSP/long_shallow_reg
Multiscale loss mean: 7.2713212966918945
Multiscale loss std: 1.0030699968338013
Textstat loss mean: 0.7809885740280151
Textstat loss std: 0.06261958926916122
Model: ../data/models/DDSP/long_shallow_noreg
Multiscale loss mean: 10.812215805053711
Multiscale loss std: 1.0561673641204834
Textstat loss mean: 0.7746486067771912
Textstat loss std: 0.0530