# Parmegiani Resynthesis Evaluation

Evaluates FM and Additive candidate audio files against a target cello signal.

## Metrics used (lower is better unless noted)
- Time MSE: Mean-squared error in time domain after RMS normalization and short-window alignment.
- Cosine distance (log-magnitude STFT): Cosine distance of flattened log |STFT|.
- Pearson distance (log-magnitude STFT): 1 − Pearson correlation of flattened log |STFT|.
- Spectral convergence: ||S − Ŝ|| / (||S|| + ε); emphasizes relative spectral errors.
- Log-spectral distance (dB): RMSE between amplitude-in-dB spectra; classic perceptual scale.
- Itakura–Saito divergence (power): D_IS(S² || Ŝ²); scale-sensitive spectral mismatch.
- MFCC L2: L2 distance of MFCC sequences (aligned in time to min length).
- Spectral flatness L1: Mean absolute difference of spectral flatness over time.
- Centroid RMSE (Hz): RMSE of spectral centroid trajectories.
- Rolloff RMSE (Hz): RMSE of spectral rolloff trajectories.
- Multi-resolution STFT (MR-STFT): Mean of log-magnitude L1 + spectral convergence across several STFT configs.
- Log–mel L1: L1 distance between log-mel spectrograms.
- Combined mel_mrstft: 0.5 · (log–mel L1 + MR-STFT).

Composite (if shown) is the unweighted mean of normalized metrics present.


In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import librosa, librosa.display
import soundfile as sf
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine

SR = 44100
N_FFT = 2048
HOP = 512
N_MELS = 64
N_MFCC = 20
MAX_ALIGN_SEC = 0.05
EPS = 1e-10

print('Parsing target and candidates from examples/parm.md')


Parsing target and candidates from examples/parm.md


In [2]:
# # Parse target and candidates from examples/parm.md
# from pathlib import Path
# import sys
# sys.path.insert(0, str(Path('.').resolve()))
# from tools.md_samples_parser import parse_samples_from_markdown_file

# md_path = Path('examples/parm.md')
# parsed = parse_samples_from_markdown_file(md_path)

# target = parsed.get('target')
# samples = parsed.get('samples', [])

# assert target and target.get('audio'), 'No Target spectra entry found in examples/parm.md'

# target_audio = Path(target['audio'])
# entries = [{'title': s['title'], 'audio': s['audio']} for s in samples]

# print('Using target =', target_audio)
# print('Candidates =', len(entries))


In [3]:
# # Export an explicit "target & candidates" cell from the parsed markdown
# from pathlib import Path, PureWindowsPath
# import sys

# # --- reuse your existing parsing ---
# sys.path.insert(0, str(Path('.').resolve()))
# from tools.md_samples_parser import parse_samples_from_markdown_file

# md_path = Path('examples/parm.md')
# parsed = parse_samples_from_markdown_file(md_path)

# target = parsed.get('target')
# samples = parsed.get('samples', [])

# assert target and target.get('audio'), 'No Target spectra entry found in examples/parm.md'

# # --- helpers to format Windows-ish raw strings like r'rendered_audio\\file.wav' ---
# def to_win_raw(p) -> str:
#     """
#     Convert a path to a Windows-looking raw string literal with doubled backslashes.
#     Example -> r'rendered_audio\\file.wav'
#     """
#     s = str(PureWindowsPath(p))              # ensure backslashes
#     s = s.replace('\\', '\\\\')              # double them
#     return f"r'{s}'"                         # raw string literal for display

# # --- collect data ---
# target_audio = Path(target['audio'])
# entries = [(s['title'], s['audio']) for s in samples]

# # --- (optional) simple grouping based on title prefixes like 'FM ...' vs 'Add ...' ---
# def group_key(title: str):
#     first = title.split()[0].lower()
#     if first in ('fm', 'add', 'additive'):
#         return first
#     return 'other'

# groups = {'fm': [], 'add': [], 'additive': [], 'other': []}
# for t, a in entries:
#     groups[group_key(t)].append((t, a))

# # --- build the explicit cell text ---
# lines = []
# lines.append("# Define explicit target and candidate file list\n")
# lines.append(f"target_audio = Path({to_win_raw(target_audio)})\n")
# lines.append("candidates = [")
# if groups['fm']:
#     lines.append("    # FM (BH/DA/DE)")
#     for t, a in groups['fm']:
#         lines.append(f"    ({t!r}, {to_win_raw(a)}),")
# if groups['add'] or groups['additive']:
#     lines.append("\n    # Additive (BH/DA/DE)")
#     for t, a in (groups['add'] + groups['additive']):
#         lines.append(f"    ({t!r}, {to_win_raw(a)}),")
# # any remaining
# if groups['other']:
#     lines.append("\n    # Other")
#     for t, a in groups['other']:
#         lines.append(f"    ({t!r}, {to_win_raw(a)}),")
# lines.append("]\n")
# lines.append("entries = [{'title': t, 'audio': a} for (t, a) in candidates]\n")
# lines.append("print('Using target =', target_audio)")
# lines.append("print('Candidates =', len(entries))")

# explicit_cell = "\n".join(lines)
# print(explicit_cell)

# # If you want to save it next to your notebook for reuse:
# # Path('explicit_target_candidates_cell.py').write_text(explicit_cell, encoding='utf-8')
# # print('Wrote explicit_target_candidates_cell.py')


In [4]:
target_audio = Path(r'rendered_audio\\additive_from_parm_2.0s_20250908-142704.wav')

candidates = [

    # Other
    ('Optimized FM with DE + cosine', r'rendered_audio\\optimized_output_fm_parm_de_cosine_20250908-150414.wav'),
    ('Optimized FM with DE + euclidean', r'rendered_audio\\optimized_output_fm_parm_de_euclidean_20250908-150743.wav'),
    ('Optimized FM with DE + itakura_saito', r'rendered_audio\\optimized_output_fm_parm_de_itakura_saito_20250908-145757.wav'),
    ('Optimized FM with DE + kl', r'rendered_audio\\optimized_output_fm_parm_de_kl_20250908-151130.wav'),
    ('Optimized FM with DE + manhattan', r'rendered_audio\\optimized_output_fm_parm_de_manhattan_20250908-150902.wav'),
    ('Optimized FM with DE + mfcc', r'rendered_audio\\optimized_output_fm_parm_de_mfcc_20250908-145245.wav'),
    ('Optimized FM with DE + pearson', r'rendered_audio\\optimized_output_fm_parm_de_pearson_20250908-144626.wav'),
    ('Optimized FM with DE + spectral_convergence', r'rendered_audio\\optimized_output_fm_parm_de_spectral_convergence_20250908-150114.wav'),
    ('Optimized FM with DA + cosine', r'rendered_audio\\optimized_output_fm_parm_da_cosine_20250908-150447.wav'),
    ('Optimized FM with DA + euclidean', r'rendered_audio\\optimized_output_fm_parm_da_euclidean_20250908-150812.wav'),
    ('Optimized FM with DA + itakura_saito', r'rendered_audio\\optimized_output_fm_parm_da_itakura_saito_20250908-145822.wav'),
    ('Optimized FM with DA + kl', r'rendered_audio\\optimized_output_fm_parm_da_kl_20250908-151159.wav'),
    ('Optimized FM with DA + manhattan', r'rendered_audio\\optimized_output_fm_parm_da_manhattan_20250908-150924.wav'),
    ('Optimized FM with DA + mfcc', r'rendered_audio\\optimized_output_fm_parm_da_mfcc_20250908-145411.wav'),
    ('Optimized FM with DA + pearson', r'rendered_audio\\optimized_output_fm_parm_da_pearson_20250908-144704.wav'),
    ('Optimized FM with DA + spectral_convergence', r'rendered_audio\\optimized_output_fm_parm_da_spectral_convergence_20250908-150143.wav'),
    ('Optimized FM with BH + cosine', r'rendered_audio\\optimized_output_fm_parm_bh_cosine_20250908-150534.wav'),
    ('Optimized FM with BH + euclidean', r'rendered_audio\\optimized_output_fm_parm_bh_euclidean_20250908-150834.wav'),
    ('Optimized FM with BH + itakura_saito', r'rendered_audio\\optimized_output_fm_parm_bh_itakura_saito_20250908-145858.wav'),
    ('Optimized FM with BH + kl', r'rendered_audio\\optimized_output_fm_parm_bh_kl_20250908-151243.wav'),
    ('Optimized FM with BH + manhattan', r'rendered_audio\\optimized_output_fm_parm_bh_manhattan_20250908-150959.wav'),
    ('Optimized FM with BH + mfcc', r'rendered_audio\\optimized_output_fm_parm_bh_mfcc_20250908-145634.wav'),
    ('Optimized FM with BH + pearson', r'rendered_audio\\optimized_output_fm_parm_bh_pearson_20250908-144818.wav'),
    ('Optimized FM with BH + spectral_convergence', r'rendered_audio\\optimized_output_fm_parm_bh_spectral_convergence_20250908-150206.wav'),
    ('Optimized Additive with DE + cosine', r'rendered_audio\\optimized_output_additive_parm_de_cosine_20250911-145449.wav'),
    ('Optimized Additive with DE + euclidean', r'rendered_audio\\optimized_output_additive_parm_de_euclidean_20250911-150843.wav'),
    ('Optimized Additive with DE + itakura_saito', r'rendered_audio\\optimized_output_additive_parm_de_itakura_saito_20250911-142345.wav'),
    ('Optimized Additive with DE + kl', r'rendered_audio\\optimized_output_additive_parm_de_kl_20250911-153657.wav'),
    ('Optimized Additive with DE + manhattan', r'rendered_audio\\optimized_output_additive_parm_de_manhattan_20250911-152815.wav'),
    ('Optimized Additive with DE + mfcc', r'rendered_audio\\optimized_output_additive_parm_de_mfcc_20250911-140638.wav'),
    ('Optimized Additive with DE + pearson', r'rendered_audio\\optimized_output_additive_parm_de_pearson_20250911-130943.wav'),
    ('Optimized Additive with DE + spectral_convergence', r'rendered_audio\\optimized_output_additive_parm_de_spectral_convergence_20250911-143519.wav'),
    ('Optimized Additive with DA + cosine', r'rendered_audio\\optimized_output_additive_parm_da_cosine_20250911-145627.wav'),
    ('Optimized Additive with DA + euclidean', r'rendered_audio\\optimized_output_additive_parm_da_euclidean_20250911-151046.wav'),
    ('Optimized Additive with DA + itakura_saito', r'rendered_audio\\optimized_output_additive_parm_da_itakura_saito_20250911-142845.wav'),
    ('Optimized Additive with DA + kl', r'rendered_audio\\optimized_output_additive_parm_da_kl_20250911-154143.wav'),
    ('Optimized Additive with DA + manhattan', r'rendered_audio\\optimized_output_additive_parm_da_manhattan_20250911-153028.wav'),
    ('Optimized Additive with DA + mfcc', r'rendered_audio\\optimized_output_additive_parm_da_mfcc_20250911-141720.wav'),
    ('Optimized Additive with DA + pearson', r'rendered_audio\\optimized_output_additive_parm_da_pearson_20250911-131453.wav'),
    ('Optimized Additive with DA + spectral_convergence', r'rendered_audio\\optimized_output_additive_parm_da_spectral_convergence_20250911-143712.wav'),
    ('Optimized Additive with BH + cosine', r'rendered_audio\\optimized_output_additive_parm_bh_cosine_20250911-150507.wav'),
    ('Optimized Additive with BH + euclidean', r'rendered_audio\\optimized_output_additive_parm_bh_euclidean_20250911-152547.wav'),
    ('Optimized Additive with BH + itakura_saito', r'rendered_audio\\optimized_output_additive_parm_bh_itakura_saito_20250911-143206.wav'),
    ('Optimized Additive with BH + kl', r'rendered_audio\\optimized_output_additive_parm_bh_kl_20250911-155341.wav'),
    ('Optimized Additive with BH + manhattan', r'rendered_audio\\optimized_output_additive_parm_bh_manhattan_20250911-153357.wav'),
    ('Optimized Additive with BH + mfcc', r'rendered_audio\\optimized_output_additive_parm_bh_mfcc_20250911-142054.wav'),
    ('Optimized Additive with BH + pearson', r'rendered_audio\\optimized_output_additive_parm_bh_pearson_20250911-133118.wav'),
    ('Optimized Additive with BH + spectral_convergence', r'rendered_audio\\optimized_output_additive_parm_bh_spectral_convergence_20250911-145142.wav'),
]

entries = [{'title': t, 'audio': a} for (t, a) in candidates]

print('Using target =', target_audio)
print('Candidates =', len(entries))

Using target = rendered_audio\additive_from_parm_2.0s_20250908-142704.wav
Candidates = 48


In [5]:
def rms_normalize(x, eps=1e-12):
    return x / (np.sqrt(np.mean(x**2) + eps))

def align_signals(x, y, max_shift_s=0.05):
    # Brute-force short lag search (±max_shift) maximizing dot product
    max_shift = int(max_shift_s * SR)
    n = min(len(x), len(y))
    x0 = x[:n]
    best = (None, -np.inf)
    for lag in range(-max_shift, max_shift+1):
        if lag >= 0:
            xa = x0[:n-lag]
            ya = y[lag:lag+(n-lag)]
        else:
            xa = x0[-lag:n]
            ya = y[:n+lag]
        if len(xa) < 32:
            continue
        s = float(np.dot(xa, ya))
        if s > best[1]:
            best = (lag, s)
    lag = best[0] or 0
    if lag > 0:
        y_al = np.pad(y, (lag, 0))[:len(x)]
    elif lag < 0:
        y_al = y[-lag:len(x)-lag] if len(y) - (-lag) >= len(x) else np.pad(y[-lag:], (0, len(x) - (len(y) + lag)))
    else:
        y_al = y[:len(x)]
    n2 = min(len(x), len(y_al))
    return x[:n2], y_al[:n2]

def stft_mag(x):
    S = librosa.stft(x, n_fft=N_FFT, hop_length=HOP, window='hann')
    return np.abs(S) + EPS

def spectral_convergence(S, Sh):
    num = np.linalg.norm(S - Sh)
    den = np.linalg.norm(S) + EPS
    return num / den

def log_spectral_distance_db(S, Sh):
    SdB = librosa.amplitude_to_db(S, ref=1.0)
    ShdB = librosa.amplitude_to_db(Sh, ref=1.0)
    return float(np.sqrt(np.mean((SdB - ShdB)**2)))

def itakura_saito(P, Ph):
    # D_IS(P || Ph) = sum(P/Ph - log(P/Ph) - 1)
    R = (P / (Ph + EPS))
    return float(np.mean(R - np.log(R + EPS) - 1.0))

def mfcc_l2(x, y):
    Mx = librosa.feature.mfcc(y=x, sr=SR, n_mfcc=N_MFCC, n_fft=N_FFT, hop_length=HOP, n_mels=N_MELS)
    My = librosa.feature.mfcc(y=y, sr=SR, n_mfcc=N_MFCC, n_fft=N_FFT, hop_length=HOP, n_mels=N_MELS)
    m = min(Mx.shape[1], My.shape[1])
    return float(np.mean(np.linalg.norm(Mx[:, :m] - My[:, :m], axis=0)))

def eval_pair(x, y):
    x = rms_normalize(x)
    y = rms_normalize(y)
    x, y = align_signals(x, y, MAX_ALIGN_SEC)
    time_mse = float(np.mean((x - y)**2))
    Sx = stft_mag(x)
    Sy = stft_mag(y)
    lx = np.log(Sx + EPS).ravel()
    ly = np.log(Sy + EPS).ravel()
    cos_dist = float(cosine(lx, ly))
    # Pearson distance on log-magnitude spectra
    ax = lx - lx.mean(); ay = ly - ly.mean()
    pearson = float(np.dot(ax, ay) / ((np.linalg.norm(ax) * np.linalg.norm(ay)) + EPS))
    pearson_dist = float(1.0 - pearson)
    sc = float(spectral_convergence(Sx, Sy))
    lsd = float(log_spectral_distance_db(Sx, Sy))
    isd = float(itakura_saito(Sx**2, Sy**2))
    mfccd = float(mfcc_l2(x, y))
    return {
        'time_mse': time_mse,
        'cosine_logmag': cos_dist,
        'pearson_logmag': pearson_dist,
        'spectral_convergence': sc,
        'lsd_db': lsd,
        'itakura_saito': isd,
        'mfcc_l2': mfccd,
    }


In [6]:
# Extended metrics: spectral flatness, centroid/rolloff RMSE, multi-resolution STFT; override eval_pair
import numpy as np
import librosa


def spectral_flatness_distance(Sx: np.ndarray, Sy: np.ndarray) -> float:
    Fx = librosa.feature.spectral_flatness(S=Sx**2)
    Fy = librosa.feature.spectral_flatness(S=Sy**2)
    m = min(Fx.shape[1], Fy.shape[1])
    return float(np.mean(np.abs(Fx[:, :m] - Fy[:, :m])))


def centroid_rolloff_rmse(Sx: np.ndarray, Sy: np.ndarray) -> tuple[float, float]:
    Cx = librosa.feature.spectral_centroid(S=Sx, sr=SR)
    Cy = librosa.feature.spectral_centroid(S=Sy, sr=SR)
    Rx = librosa.feature.spectral_rolloff(S=Sx, sr=SR)
    Ry = librosa.feature.spectral_rolloff(S=Sy, sr=SR)
    m = min(Cx.shape[1], Cy.shape[1], Rx.shape[1], Ry.shape[1])
    cen_rmse = float(np.sqrt(np.mean((Cx[:, :m] - Cy[:, :m])**2)))
    rol_rmse = float(np.sqrt(np.mean((Rx[:, :m] - Ry[:, :m])**2)))
    return cen_rmse, rol_rmse


def mrstft_loss(x: np.ndarray, y: np.ndarray) -> float:
    configs = [
        (1024, 256),
        (2048, 512),
        (4096, 1024),
    ]
    scores: list[float] = []
    for nfft, hop in configs:
        Sx = np.abs(librosa.stft(x, n_fft=nfft, hop_length=hop, window='hann')) + EPS
        Sy = np.abs(librosa.stft(y, n_fft=nfft, hop_length=hop, window='hann')) + EPS
        l1_log = float(np.mean(np.abs(np.log(Sx) - np.log(Sy))))
        sc = float(spectral_convergence(Sx, Sy))
        scores.append(l1_log + sc)
    return float(np.mean(scores))


def mel_l1(x: np.ndarray, y: np.ndarray) -> float:
    Mx = librosa.feature.melspectrogram(y=x, sr=SR, n_fft=N_FFT, hop_length=HOP, n_mels=N_MELS, power=1.0)
    My = librosa.feature.melspectrogram(y=y, sr=SR, n_fft=N_FFT, hop_length=HOP, n_mels=N_MELS, power=1.0)
    Mx = np.log(Mx + EPS)
    My = np.log(My + EPS)
    m = min(Mx.shape[1], My.shape[1])
    return float(np.mean(np.abs(Mx[:, :m] - My[:, :m])))


def eval_pair(x: np.ndarray, y: np.ndarray) -> dict:
    x = rms_normalize(x)
    y = rms_normalize(y)
    x, y = align_signals(x, y, MAX_ALIGN_SEC)

    time_mse = float(np.mean((x - y)**2))

    Sx = stft_mag(x)
    Sy = stft_mag(y)

    lx = np.log(Sx + EPS).ravel()
    ly = np.log(Sy + EPS).ravel()
    cos_dist = float(cosine(lx, ly))

    ax = lx - lx.mean(); ay = ly - ly.mean()
    pearson = float(np.dot(ax, ay) / ((np.linalg.norm(ax) * np.linalg.norm(ay)) + EPS))
    pearson_dist = float(1.0 - pearson)

    sc = float(spectral_convergence(Sx, Sy))
    lsd = float(log_spectral_distance_db(Sx, Sy))
    isd = float(itakura_saito(Sx**2, Sy**2))
    mfccd = float(mfcc_l2(x, y))

    sflat = float(spectral_flatness_distance(Sx, Sy))
    cen_rmse, rol_rmse = centroid_rolloff_rmse(Sx, Sy)
    mrstft = float(mrstft_loss(x, y))
    mel = float(mel_l1(x, y))
    mel_mrstft = float(0.5 * (mel + mrstft))

    return {
        'time_mse': time_mse,
        'cosine_logmag': cos_dist,
        'pearson_logmag': pearson_dist,
        'spectral_convergence': sc,
        'lsd_db': lsd,
        'itakura_saito': isd,
        'mfcc_l2': mfccd,
        'flatness_l1': sflat,
        'centroid_rmse_hz': cen_rmse,
        'rolloff_rmse_hz': rol_rmse,
        'mrstft': mrstft,
        'mel_l1': mel,
        'mel_mrstft': mel_mrstft,
    }



In [7]:
# Load target and evaluate all candidates

tx, _ = librosa.load(str(target_audio), sr=SR, mono=True)
rows = []
for e in entries:
    p = Path(e['audio'])
    if not p.exists():
        print('Missing audio:', p)
        continue
    y, _ = librosa.load(str(p), sr=SR, mono=True)
    mets = eval_pair(tx, y)
    row = {'title': e['title'], 'path': str(p).replace('\\', '/')}
    row.update(mets)
    rows.append(row)

df = pd.DataFrame(rows)
print('Evaluated', len(df), 'examples')
df.head()


Evaluated 48 examples


Unnamed: 0,title,path,time_mse,cosine_logmag,pearson_logmag,spectral_convergence,lsd_db,itakura_saito,mfcc_l2,flatness_l1,centroid_rmse_hz,rolloff_rmse_hz,mrstft,mel_l1,mel_mrstft
0,Optimized FM with DE + cosine,rendered_audio/optimized_output_fm_parm_de_cos...,1.396553,0.033571,0.314433,0.610332,6.812235,1757575000.0,123.454979,0.005780351,853.289192,1467.564415,1.718057,1.627066,1.672562
1,Optimized FM with DE + euclidean,rendered_audio/optimized_output_fm_parm_de_euc...,1.994336,1.915346,1.038531,1.379479,45.046986,2337.333,384.074432,0.0004956062,10518.462633,18651.809141,11.2275,8.997169,10.112334
2,Optimized FM with DE + itakura_saito,rendered_audio/optimized_output_fm_parm_de_ita...,1.999126,0.014365,0.326662,1.387054,6.903828,371.7216,88.570244,2.455377e-11,208.747702,436.637617,2.354476,0.955381,1.654928
3,Optimized FM with DE + kl,rendered_audio/optimized_output_fm_parm_de_kl_...,2.294999,0.034051,0.397494,0.570627,3.782495,13249800000.0,49.12262,0.0115607,1203.214858,2089.430828,1.591849,0.816037,1.203943
4,Optimized FM with DE + manhattan,rendered_audio/optimized_output_fm_parm_de_man...,1.98247,0.029026,0.460053,1.290187,6.081263,13249800000.0,82.686348,0.0115607,1207.749317,2103.949896,2.373658,1.049438,1.711548


In [8]:
# Rank by normalized metrics and compute composite
metrics = [
    'time_mse','cosine_logmag','pearson_logmag','spectral_convergence','lsd_db','itakura_saito','mfcc_l2',
    'flatness_l1','centroid_rmse_hz','rolloff_rmse_hz','mrstft','mel_l1','mel_mrstft'
]
df_rank = df.copy()
for m in metrics:
    if m not in df_rank.columns:
        continue
    lo, hi = df_rank[m].min(), df_rank[m].max()
    df_rank[m+'_norm'] = 0.5 if hi - lo < 1e-12 else (df_rank[m] - lo) / (hi - lo)
avail_norms = [m+'_norm' for m in metrics if m+'_norm' in df_rank.columns]
df_rank['composite'] = df_rank[avail_norms].mean(axis=1) if avail_norms else 0.0
df_rank_sorted = df_rank.sort_values('composite')
df_rank_sorted[['title','path','composite'] + [m for m in metrics if m in df_rank.columns]].head(10)


Unnamed: 0,title,path,composite,time_mse,cosine_logmag,pearson_logmag,spectral_convergence,lsd_db,itakura_saito,mfcc_l2,flatness_l1,centroid_rmse_hz,rolloff_rmse_hz,mrstft,mel_l1,mel_mrstft
12,Optimized FM with DA + manhattan,rendered_audio/optimized_output_fm_parm_da_man...,0.070914,1.95528,0.008844,0.205296,0.7317,3.095472,38.2128,54.082687,9.479741e-18,35.877562,64.141598,1.664384,0.727224,1.195804
29,Optimized Additive with DE + mfcc,rendered_audio/optimized_output_additive_parm_...,0.077959,1.935445,0.015112,0.288492,0.746724,2.936682,20.00315,36.673431,9.08948e-18,7.593332,65.546929,1.749811,0.928023,1.338917
0,Optimized FM with DE + cosine,rendered_audio/optimized_output_fm_parm_de_cos...,0.079612,1.396553,0.033571,0.314433,0.610332,6.812235,1757575000.0,123.454979,0.005780351,853.289192,1467.564415,1.718057,1.627066,1.672562
13,Optimized FM with DA + mfcc,rendered_audio/optimized_output_fm_parm_da_mfc...,0.09938,1.964976,0.023552,0.376483,0.750618,3.20192,3141761000.0,35.14378,0.0115607,868.06804,1512.460499,1.807196,0.756533,1.281865
3,Optimized FM with DE + kl,rendered_audio/optimized_output_fm_parm_de_kl_...,0.108388,2.294999,0.034051,0.397494,0.570627,3.782495,13249800000.0,49.12262,0.0115607,1203.214858,2089.430828,1.591849,0.816037,1.203943
6,Optimized FM with DE + pearson,rendered_audio/optimized_output_fm_parm_de_pea...,0.120708,2.464465,0.037416,0.394737,0.466267,6.275246,13549710000.0,111.540894,0.005780352,870.257092,1536.178328,1.584145,1.583976,1.58406
37,Optimized Additive with DA + mfcc,rendered_audio/optimized_output_additive_parm_...,0.123095,1.996864,0.028944,0.470561,0.92105,3.314882,13549710000.0,40.161514,0.005780352,848.949814,1447.084667,1.949445,0.635593,1.292519
5,Optimized FM with DE + mfcc,rendered_audio/optimized_output_fm_parm_de_mfc...,0.139709,2.005195,0.033433,0.468728,1.091806,3.50098,482.5281,26.456501,7.277687e-10,278.305453,646.336221,2.44573,1.594762,2.020246
10,Optimized FM with DA + itakura_saito,rendered_audio/optimized_output_fm_parm_da_ita...,0.155799,1.999149,0.014336,0.326036,1.387034,6.903105,372.4639,88.566833,2.447114e-11,208.70728,436.637617,2.354393,0.955874,1.655133
2,Optimized FM with DE + itakura_saito,rendered_audio/optimized_output_fm_parm_de_ita...,0.155851,1.999126,0.014365,0.326662,1.387054,6.903828,371.7216,88.570244,2.455377e-11,208.747702,436.637617,2.354476,0.955381,1.654928


In [9]:
# Save CSV summary for reference (include all metrics + composite)
out_csv = Path('examples/parm_eval.csv')
df_rank_sorted.to_csv(out_csv, index=False)
print('Saved:', out_csv.resolve())


Saved: C:\Users\egorp\Nextcloud\code\public_repos\FFTimbre\examples\parm_eval.csv


In [10]:
# Quick sanity checks: metrics on identical signals should be ~0.
from IPython.display import display

x, _ = librosa.load(str(target_audio), sr=SR, mono=True)
xe = x.copy()
metrics_ident = eval_pair(x, xe)
print('Target vs Target metrics:')
display(pd.Series(metrics_ident))

# Also test a trivial mismatch (silence) for non-zero
sil = np.zeros_like(x)
metrics_sil = eval_pair(x, sil)
print('Target vs Silence metrics:')
display(pd.Series(metrics_sil))



Target vs Target metrics:


time_mse                0.000000e+00
cosine_logmag           0.000000e+00
pearson_logmag          2.220446e-16
spectral_convergence    0.000000e+00
lsd_db                  0.000000e+00
itakura_saito           1.908595e-04
mfcc_l2                 0.000000e+00
flatness_l1             0.000000e+00
centroid_rmse_hz        0.000000e+00
rolloff_rmse_hz         0.000000e+00
mrstft                  0.000000e+00
mel_l1                  0.000000e+00
mel_mrstft              0.000000e+00
dtype: float64

Target vs Silence metrics:


time_mse                1.000000e+00
cosine_logmag           2.334947e-02
pearson_logmag          1.000000e+00
spectral_convergence    1.000000e+00
lsd_db                  7.847701e+01
itakura_saito           7.663294e+12
mfcc_l2                 5.484642e+02
flatness_l1             1.000001e+00
centroid_rmse_hz        1.095298e+04
rolloff_rmse_hz         1.864825e+04
mrstft                  1.675741e+01
mel_l1                  1.394487e+01
mel_mrstft              1.535114e+01
dtype: float64

## Interactive exploration of evaluation results

Load `examples/parm_eval.csv` and explore metrics and composite with filters, sorting, and plots. Audio players for selected Top K are shown below the chart.


In [None]:
# Interactive explorer (no Plotly); adds inline audio players for selected Top K
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

try:
    import ipywidgets as W
    from IPython.display import display, clear_output, Audio
    HAS_WIDGETS = True
except Exception:
    HAS_WIDGETS = False

csv_path = Path('examples/parm_eval.csv')
assert csv_path.exists(), f'Missing {csv_path}'
base_df = pd.read_csv(csv_path)

# Identify metric columns and ensure composite is present if available
known_metrics = ['time_mse','cosine_logmag','pearson_logmag','spectral_convergence','lsd_db','itakura_saito','mfcc_l2','flatness_l1','centroid_rmse_hz','rolloff_rmse_hz','mrstft','mel_l1','mel_mrstft']
metric_cols = [c for c in base_df.columns if c in known_metrics]
extra_cols = [c for c in ['composite'] if c in base_df.columns]
metrics_all = extra_cols + metric_cols

if not HAS_WIDGETS:
    print('ipywidgets not available; showing static tables/plots')
    display(base_df.head())
else:
    # Controls
    metric_dd = W.Dropdown(options=metrics_all or metric_cols, value=(metrics_all or metric_cols)[0], description='Metric:')
    sort_dir = W.ToggleButtons(options=['asc','desc'], value='asc', description='Sort:')
    filter_text = W.Text(value='', description='Filter title:')
    top_k = W.IntSlider(value=10, min=5, max=max(10, len(base_df)), step=1, description='Top K:')
    normalize = W.Checkbox(value=False, description='Normalize 0..1 (per-metric)')

    ui = W.VBox([
        W.HBox([metric_dd, sort_dir, normalize]),
        W.HBox([filter_text, top_k])
    ])

    out = W.Output()

    def render(_=None):
        with out:
            clear_output(wait=True)
            df = base_df.copy()
            if filter_text.value.strip():
                q = filter_text.value.strip().lower()
                df = df[df['title'].str.lower().str.contains(q)]
            m = metric_dd.value
            if normalize.value:
                lo, hi = df[m].min(), df[m].max()
                df['_val'] = 0.5 if hi - lo < 1e-12 else (df[m] - lo) / (hi - lo)
            else:
                df['_val'] = df[m]
            ascending = (sort_dir.value == 'asc')
            df = df.sort_values('_val', ascending=ascending).head(top_k.value)

            # Show table
            display(df[['title','path', m]].rename(columns={m: f'{m}'}))

            # Matplotlib bar
            plt.figure(figsize=(10, max(3, 0.4*len(df))))
            plt.barh(df['title'], df['_val'], color='steelblue')
            plt.gca().invert_yaxis()
            plt.xlabel(f'{m} (normalized)' if normalize.value else m)
            plt.title('Top results')
            plt.tight_layout()
            plt.show()

            # Inline audio players (aligned to target duration)
            # Robustly select a valid target path to avoid deprecated audioread fallback
            _target_candidate = None
            if 'target_path' in base_df.columns and len(base_df['target_path']) > 0:
                _cand = str(base_df['target_path'].iloc[0])
                if isinstance(_cand, str) and _cand.strip() and Path(_cand).is_file():
                    _target_candidate = Path(_cand)
            if _target_candidate is None:
                _target_candidate = Path('rendered_audio/additive_from_cello_single_2.0s_20250906-215542.wav')
            target, _ = librosa.load(str(_target_candidate), sr=SR, mono=True)
            display(W.HTML('<b>Audio (in current order):</b>'))
            for _, r in df.iterrows():
                try:
                    y, _ = librosa.load(r['path'], sr=SR, mono=True)
                    _, y_al = align_signals(target, y, MAX_ALIGN_SEC)
                    y_al = np.nan_to_num(y_al, nan=0.0, posinf=0.0, neginf=0.0)
                    display(W.HTML(f"<div style='margin:6px 0'><b>{r['title']}</b> &nbsp; <small>{r['path']}</small></div>"))
                    display(Audio(y_al, rate=SR, normalize=False))
                except Exception as ex:
                    display(W.HTML(f"<div style='color:#b00'>Audio load failed: {r['path']} ({ex})</div>"))

    for w in [metric_dd, sort_dir, filter_text, top_k, normalize]:
        w.observe(render, names='value')

    display(ui, out)
    render()



VBox(children=(HBox(children=(Dropdown(description='Metric:', options=('composite', 'time_mse', 'cosine_logmag…

Output()