# Cello Resynthesis Evaluation

Evaluates FM and Additive candidate audio files against a target cello signal.

## Metrics used (lower is better unless noted)
- Time MSE: Mean-squared error in time domain after RMS normalization and short-window alignment.
- Cosine distance (log-magnitude STFT): Cosine distance of flattened log |STFT|.
- Pearson distance (log-magnitude STFT): 1 − Pearson correlation of flattened log |STFT|.
- Spectral convergence: ||S − Ŝ|| / (||S|| + ε); emphasizes relative spectral errors.
- Log-spectral distance (dB): RMSE between amplitude-in-dB spectra; classic perceptual scale.
- Itakura–Saito divergence (power): D_IS(S² || Ŝ²); scale-sensitive spectral mismatch.
- MFCC L2: L2 distance of MFCC sequences (aligned in time to min length).
- Spectral flatness L1: Mean absolute difference of spectral flatness over time.
- Centroid RMSE (Hz): RMSE of spectral centroid trajectories.
- Rolloff RMSE (Hz): RMSE of spectral rolloff trajectories.
- Multi-resolution STFT (MR-STFT): Mean of log-magnitude L1 + spectral convergence across several STFT configs.
- Log–mel L1: L1 distance between log-mel spectrograms.
- Combined mel_mrstft: 0.5 · (log–mel L1 + MR-STFT).

Composite (if shown) is the unweighted mean of normalized metrics present.


In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import librosa, librosa.display
import soundfile as sf
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine

SR = 44100
N_FFT = 2048
HOP = 512
N_MELS = 64
N_MFCC = 20
MAX_ALIGN_SEC = 0.05
EPS = 1e-10

print('Using explicit candidate list; no markdown parsing')


Using explicit candidate list; no markdown parsing


In [2]:
# Define explicit target and candidate file list

target_audio = Path(r'rendered_audio\\additive_from_cello_single_2.0s_20250906-215542.wav')

candidates = [
    # FM (BH/DA/DE)
    ('FM BH cosine', r'rendered_audio\\optimized_output_fm_cello_single_bh_cosine_20250904-003840.wav'),
    ('FM BH euclidean', r'rendered_audio\\optimized_output_fm_cello_single_bh_euclidean_20250904-004138.wav'),
    ('FM BH itakura_saito', r'rendered_audio\\optimized_output_fm_cello_single_bh_itakura_saito_20250904-003244.wav'),
    ('FM BH kl', r'rendered_audio\\optimized_output_fm_cello_single_bh_kl_20250904-004529.wav'),
    ('FM BH manhattan', r'rendered_audio\\optimized_output_fm_cello_single_bh_manhattan_20250904-004305.wav'),
    ('FM BH mfcc', r'rendered_audio\\optimized_output_fm_cello_single_bh_mfcc_20250904-003015.wav'),
    ('FM BH pearson', r'rendered_audio\\optimized_output_fm_cello_single_bh_pearson_20250904-002243.wav'),
    ('FM BH spectral_convergence', r'rendered_audio\\optimized_output_fm_cello_single_bh_spectral_convergence_20250904-003533.wav'),

    ('FM DA cosine', r'rendered_audio\\optimized_output_fm_cello_single_da_cosine_20250904-003801.wav'),
    ('FM DA euclidean', r'rendered_audio\\optimized_output_fm_cello_single_da_euclidean_20250904-004117.wav'),
    ('FM DA itakura_saito', r'rendered_audio\\optimized_output_fm_cello_single_da_itakura_saito_20250904-003210.wav'),
    ('FM DA kl', r'rendered_audio\\optimized_output_fm_cello_single_da_kl_20250904-004455.wav'),
    ('FM DA manhattan', r'rendered_audio\\optimized_output_fm_cello_single_da_manhattan_20250904-004233.wav'),
    ('FM DA mfcc', r'rendered_audio\\optimized_output_fm_cello_single_da_mfcc_20250904-002818.wav'),
    ('FM DA pearson', r'rendered_audio\\optimized_output_fm_cello_single_da_pearson_20250904-002136.wav'),
    ('FM DA spectral_convergence', r'rendered_audio\\optimized_output_fm_cello_single_da_spectral_convergence_20250904-003515.wav'),

    ('FM DE cosine', r'rendered_audio\\optimized_output_fm_cello_single_de_cosine_20250904-003735.wav'),
    ('FM DE euclidean', r'rendered_audio\\optimized_output_fm_cello_single_de_euclidean_20250904-004049.wav'),
    ('FM DE itakura_saito', r'rendered_audio\\optimized_output_fm_cello_single_de_itakura_saito_20250904-003136.wav'),
    ('FM DE kl', r'rendered_audio\\optimized_output_fm_cello_single_de_kl_20250904-004429.wav'),
    ('FM DE manhattan', r'rendered_audio\\optimized_output_fm_cello_single_de_manhattan_20250904-004211.wav'),
    ('FM DE mfcc', r'rendered_audio\\optimized_output_fm_cello_single_de_mfcc_20250904-002651.wav'),
    ('FM DE pearson', r'rendered_audio\\optimized_output_fm_cello_single_de_pearson_20250904-002050.wav'),
    ('FM DE spectral_convergence', r'rendered_audio\\optimized_output_fm_cello_single_de_spectral_convergence_20250904-003453.wav'),

    # Additive (BH/DA/DE)
    ('Add BH cosine', r'rendered_audio\\optimized_output_additive_cello_single_bh_cosine_20250911-051747.wav'),
    ('Add BH euclidean', r'rendered_audio\\optimized_output_additive_cello_single_bh_euclidean_20250911-053953.wav'),
    ('Add BH itakura_saito', r'rendered_audio\\optimized_output_additive_cello_single_bh_itakura_saito_20250911-044808.wav'),
    ('Add BH kl', r'rendered_audio\\optimized_output_additive_cello_single_bh_kl_20250911-060219.wav'),
    ('Add BH manhattan', r'rendered_audio\\optimized_output_additive_cello_single_bh_manhattan_20250911-054610.wav'),
    ('Add BH mfcc', r'rendered_audio\\optimized_output_additive_cello_single_bh_mfcc_20250911-043942.wav'),
    ('Add BH pearson', r'rendered_audio\\optimized_output_additive_cello_single_bh_pearson_20250911-034659.wav'),
    ('Add BH spectral_convergence', r'rendered_audio\\optimized_output_additive_cello_single_bh_spectral_convergence_20250911-050404.wav'),

    ('Add DA cosine', r'rendered_audio\\optimized_output_additive_cello_single_da_cosine_20250911-050845.wav'),
    ('Add DA euclidean', r'rendered_audio\\optimized_output_additive_cello_single_da_euclidean_20250911-052230.wav'),
    ('Add DA itakura_saito', r'rendered_audio\\optimized_output_additive_cello_single_da_itakura_saito_20250911-044510.wav'),
    ('Add DA kl', r'rendered_audio\\optimized_output_additive_cello_single_da_kl_20250911-055207.wav'),
    ('Add DA manhattan', r'rendered_audio\\optimized_output_additive_cello_single_da_manhattan_20250911-054410.wav'),
    ('Add DA mfcc', r'rendered_audio\\optimized_output_additive_cello_single_da_mfcc_20250911-043445.wav'),
    ('Add DA pearson', r'rendered_audio\\optimized_output_additive_cello_single_da_pearson_20250911-033105.wav'),
    ('Add DA spectral_convergence', r'rendered_audio\\optimized_output_additive_cello_single_da_spectral_convergence_20250911-045258.wav'),

    ('Add DE cosine', r'rendered_audio\\optimized_output_additive_cello_single_de_cosine_20250911-050644.wav'),
    ('Add DE euclidean', r'rendered_audio\\optimized_output_additive_cello_single_de_euclidean_20250911-052106.wav'),
    ('Add DE itakura_saito', r'rendered_audio\\optimized_output_additive_cello_single_de_itakura_saito_20250911-044231.wav'),
    ('Add DE kl', r'rendered_audio\\optimized_output_additive_cello_single_de_kl_20250911-054856.wav'),
    ('Add DE manhattan', r'rendered_audio\\optimized_output_additive_cello_single_de_manhattan_20250911-054218.wav'),
    ('Add DE mfcc', r'rendered_audio\\optimized_output_additive_cello_single_de_mfcc_20250911-042920.wav'),
    ('Add DE pearson', r'rendered_audio\\optimized_output_additive_cello_single_de_pearson_20250911-032644.wav'),
    ('Add DE spectral_convergence', r'rendered_audio\\optimized_output_additive_cello_single_de_spectral_convergence_20250911-045113.wav'),
]

entries = [{'title': t, 'audio': a} for (t, a) in candidates]

print('Using target =', target_audio)
print('Candidates =', len(entries))


Using target = rendered_audio\additive_from_cello_single_2.0s_20250906-215542.wav
Candidates = 48


In [3]:
def rms_normalize(x, eps=1e-12):
    return x / (np.sqrt(np.mean(x**2) + eps))

def align_signals(x, y, max_shift_s=0.05):
    # Brute-force short lag search (±max_shift) maximizing dot product
    max_shift = int(max_shift_s * SR)
    n = min(len(x), len(y))
    x0 = x[:n]
    best = (None, -np.inf)
    for lag in range(-max_shift, max_shift+1):
        if lag >= 0:
            xa = x0[:n-lag]
            ya = y[lag:lag+(n-lag)]
        else:
            xa = x0[-lag:n]
            ya = y[:n+lag]
        if len(xa) < 32:
            continue
        s = float(np.dot(xa, ya))
        if s > best[1]:
            best = (lag, s)
    lag = best[0] or 0
    if lag > 0:
        y_al = np.pad(y, (lag, 0))[:len(x)]
    elif lag < 0:
        y_al = y[-lag:len(x)-lag] if len(y) - (-lag) >= len(x) else np.pad(y[-lag:], (0, len(x) - (len(y) + lag)))
    else:
        y_al = y[:len(x)]
    n2 = min(len(x), len(y_al))
    return x[:n2], y_al[:n2]

def stft_mag(x):
    S = librosa.stft(x, n_fft=N_FFT, hop_length=HOP, window='hann')
    return np.abs(S) + EPS

def spectral_convergence(S, Sh):
    num = np.linalg.norm(S - Sh)
    den = np.linalg.norm(S) + EPS
    return num / den

def log_spectral_distance_db(S, Sh):
    SdB = librosa.amplitude_to_db(S, ref=1.0)
    ShdB = librosa.amplitude_to_db(Sh, ref=1.0)
    return float(np.sqrt(np.mean((SdB - ShdB)**2)))

def itakura_saito(P, Ph):
    # D_IS(P || Ph) = sum(P/Ph - log(P/Ph) - 1)
    R = (P / (Ph + EPS))
    return float(np.mean(R - np.log(R + EPS) - 1.0))

def mfcc_l2(x, y):
    Mx = librosa.feature.mfcc(y=x, sr=SR, n_mfcc=N_MFCC, n_fft=N_FFT, hop_length=HOP, n_mels=N_MELS)
    My = librosa.feature.mfcc(y=y, sr=SR, n_mfcc=N_MFCC, n_fft=N_FFT, hop_length=HOP, n_mels=N_MELS)
    m = min(Mx.shape[1], My.shape[1])
    return float(np.mean(np.linalg.norm(Mx[:, :m] - My[:, :m], axis=0)))

def eval_pair(x, y):
    x = rms_normalize(x)
    y = rms_normalize(y)
    x, y = align_signals(x, y, MAX_ALIGN_SEC)
    time_mse = float(np.mean((x - y)**2))
    Sx = stft_mag(x)
    Sy = stft_mag(y)
    lx = np.log(Sx + EPS).ravel()
    ly = np.log(Sy + EPS).ravel()
    cos_dist = float(cosine(lx, ly))
    # Pearson distance on log-magnitude spectra
    ax = lx - lx.mean(); ay = ly - ly.mean()
    pearson = float(np.dot(ax, ay) / ((np.linalg.norm(ax) * np.linalg.norm(ay)) + EPS))
    pearson_dist = float(1.0 - pearson)
    sc = float(spectral_convergence(Sx, Sy))
    lsd = float(log_spectral_distance_db(Sx, Sy))
    isd = float(itakura_saito(Sx**2, Sy**2))
    mfccd = float(mfcc_l2(x, y))
    return {
        'time_mse': time_mse,
        'cosine_logmag': cos_dist,
        'pearson_logmag': pearson_dist,
        'spectral_convergence': sc,
        'lsd_db': lsd,
        'itakura_saito': isd,
        'mfcc_l2': mfccd,
    }


In [4]:
# Extended metrics: spectral flatness, centroid/rolloff RMSE, multi-resolution STFT; override eval_pair
import numpy as np
import librosa


def spectral_flatness_distance(Sx: np.ndarray, Sy: np.ndarray) -> float:
    Fx = librosa.feature.spectral_flatness(S=Sx**2)
    Fy = librosa.feature.spectral_flatness(S=Sy**2)
    m = min(Fx.shape[1], Fy.shape[1])
    return float(np.mean(np.abs(Fx[:, :m] - Fy[:, :m])))


def centroid_rolloff_rmse(Sx: np.ndarray, Sy: np.ndarray) -> tuple[float, float]:
    Cx = librosa.feature.spectral_centroid(S=Sx, sr=SR)
    Cy = librosa.feature.spectral_centroid(S=Sy, sr=SR)
    Rx = librosa.feature.spectral_rolloff(S=Sx, sr=SR)
    Ry = librosa.feature.spectral_rolloff(S=Sy, sr=SR)
    m = min(Cx.shape[1], Cy.shape[1], Rx.shape[1], Ry.shape[1])
    cen_rmse = float(np.sqrt(np.mean((Cx[:, :m] - Cy[:, :m])**2)))
    rol_rmse = float(np.sqrt(np.mean((Rx[:, :m] - Ry[:, :m])**2)))
    return cen_rmse, rol_rmse


def mrstft_loss(x: np.ndarray, y: np.ndarray) -> float:
    configs = [
        (1024, 256),
        (2048, 512),
        (4096, 1024),
    ]
    scores: list[float] = []
    for nfft, hop in configs:
        Sx = np.abs(librosa.stft(x, n_fft=nfft, hop_length=hop, window='hann')) + EPS
        Sy = np.abs(librosa.stft(y, n_fft=nfft, hop_length=hop, window='hann')) + EPS
        l1_log = float(np.mean(np.abs(np.log(Sx) - np.log(Sy))))
        sc = float(spectral_convergence(Sx, Sy))
        scores.append(l1_log + sc)
    return float(np.mean(scores))


def mel_l1(x: np.ndarray, y: np.ndarray) -> float:
    Mx = librosa.feature.melspectrogram(y=x, sr=SR, n_fft=N_FFT, hop_length=HOP, n_mels=N_MELS, power=1.0)
    My = librosa.feature.melspectrogram(y=y, sr=SR, n_fft=N_FFT, hop_length=HOP, n_mels=N_MELS, power=1.0)
    Mx = np.log(Mx + EPS)
    My = np.log(My + EPS)
    m = min(Mx.shape[1], My.shape[1])
    return float(np.mean(np.abs(Mx[:, :m] - My[:, :m])))


def eval_pair(x: np.ndarray, y: np.ndarray) -> dict:
    x = rms_normalize(x)
    y = rms_normalize(y)
    x, y = align_signals(x, y, MAX_ALIGN_SEC)

    time_mse = float(np.mean((x - y)**2))

    Sx = stft_mag(x)
    Sy = stft_mag(y)

    lx = np.log(Sx + EPS).ravel()
    ly = np.log(Sy + EPS).ravel()
    cos_dist = float(cosine(lx, ly))

    ax = lx - lx.mean(); ay = ly - ly.mean()
    pearson = float(np.dot(ax, ay) / ((np.linalg.norm(ax) * np.linalg.norm(ay)) + EPS))
    pearson_dist = float(1.0 - pearson)

    sc = float(spectral_convergence(Sx, Sy))
    lsd = float(log_spectral_distance_db(Sx, Sy))
    isd = float(itakura_saito(Sx**2, Sy**2))
    mfccd = float(mfcc_l2(x, y))

    sflat = float(spectral_flatness_distance(Sx, Sy))
    cen_rmse, rol_rmse = centroid_rolloff_rmse(Sx, Sy)
    mrstft = float(mrstft_loss(x, y))
    mel = float(mel_l1(x, y))
    mel_mrstft = float(0.5 * (mel + mrstft))

    return {
        'time_mse': time_mse,
        'cosine_logmag': cos_dist,
        'pearson_logmag': pearson_dist,
        'spectral_convergence': sc,
        'lsd_db': lsd,
        'itakura_saito': isd,
        'mfcc_l2': mfccd,
        'flatness_l1': sflat,
        'centroid_rmse_hz': cen_rmse,
        'rolloff_rmse_hz': rol_rmse,
        'mrstft': mrstft,
        'mel_l1': mel,
        'mel_mrstft': mel_mrstft,
    }



In [5]:
# Load target and evaluate all candidates

tx, _ = librosa.load(str(target_audio), sr=SR, mono=True)
rows = []
for e in entries:
    p = Path(e['audio'])
    if not p.exists():
        print('Missing audio:', p)
        continue
    y, _ = librosa.load(str(p), sr=SR, mono=True)
    mets = eval_pair(tx, y)
    row = {'title': e['title'], 'path': str(p).replace('\\', '/')}
    row.update(mets)
    rows.append(row)

df = pd.DataFrame(rows)
print('Evaluated', len(df), 'examples')
df.head()


Evaluated 48 examples


Unnamed: 0,title,path,time_mse,cosine_logmag,pearson_logmag,spectral_convergence,lsd_db,itakura_saito,mfcc_l2,flatness_l1,centroid_rmse_hz,rolloff_rmse_hz,mrstft,mel_l1,mel_mrstft
0,FM BH cosine,rendered_audio/optimized_output_fm_cello_singl...,1.972438,1.450099,1.019511,1.356645,37.782352,22408.74,363.784058,0.0008497111,10665.954341,18210.449825,9.491282,7.125466,8.308374
1,FM BH euclidean,rendered_audio/optimized_output_fm_cello_singl...,1.988333,1.24571,0.975623,1.391194,34.697861,5601022000.0,367.199249,0.006092325,10787.628922,18331.672039,9.33908,7.070755,8.204918
2,FM BH itakura_saito,rendered_audio/optimized_output_fm_cello_singl...,1.999986,0.091289,0.997696,1.414149,11.773578,3197672000.0,267.81134,1.287249e-12,2270.123588,2091.479663,3.413335,4.159803,3.786569
3,FM BH kl,rendered_audio/optimized_output_fm_cello_singl...,1.977271,1.763306,1.050789,1.353393,48.027313,10734540000.0,375.450287,0.08225173,10686.799117,18257.308644,11.116738,7.488438,9.302588
4,FM BH manhattan,rendered_audio/optimized_output_fm_cello_singl...,1.0,0.054147,1.0,1.0,79.545364,7639641000000.0,629.558044,1.000001,10753.50102,18324.897368,17.296992,15.936289,16.61664


In [6]:
# Rank by normalized metrics and compute composite
metrics = [
    'time_mse','cosine_logmag','pearson_logmag','spectral_convergence','lsd_db','itakura_saito','mfcc_l2',
    'flatness_l1','centroid_rmse_hz','rolloff_rmse_hz','mrstft','mel_l1','mel_mrstft'
]
df_rank = df.copy()
for m in metrics:
    if m not in df_rank.columns:
        continue
    lo, hi = df_rank[m].min(), df_rank[m].max()
    df_rank[m+'_norm'] = 0.5 if hi - lo < 1e-12 else (df_rank[m] - lo) / (hi - lo)
avail_norms = [m+'_norm' for m in metrics if m+'_norm' in df_rank.columns]
df_rank['composite'] = df_rank[avail_norms].mean(axis=1) if avail_norms else 0.0
df_rank_sorted = df_rank.sort_values('composite')
df_rank_sorted[['title','path','composite'] + [m for m in metrics if m in df_rank.columns]].head(10)


Unnamed: 0,title,path,composite,time_mse,cosine_logmag,pearson_logmag,spectral_convergence,lsd_db,itakura_saito,mfcc_l2,flatness_l1,centroid_rmse_hz,rolloff_rmse_hz,mrstft,mel_l1,mel_mrstft
16,FM DE cosine,rendered_audio/optimized_output_fm_cello_singl...,0.034736,0.237791,0.033117,0.246917,0.385975,7.513193,188.304,86.038124,8.692132e-18,39.97359,129.45827,1.531183,1.712121,1.621652
22,FM DE pearson,rendered_audio/optimized_output_fm_cello_singl...,0.037302,0.260248,0.034393,0.24982,0.381339,7.590257,103.6142,91.119659,3.75438e-17,46.707802,161.944425,1.551195,1.782301,1.666748
32,Add DA cosine,rendered_audio/optimized_output_additive_cello...,0.102211,0.033639,0.087348,0.59139,0.154265,8.994841,23693.51,161.042908,9.265712e-14,589.319168,131.328765,4.259987,3.976761,4.118374
19,FM DE kl,rendered_audio/optimized_output_fm_cello_singl...,0.111262,1.765575,0.046981,0.364524,0.367659,5.072733,48455920000.0,60.033226,0.0115607,1160.099264,1993.63889,1.461437,1.120882,1.29116
14,FM DA pearson,rendered_audio/optimized_output_fm_cello_singl...,0.143899,0.147503,0.139118,0.764183,0.371465,10.599982,789133.4,187.353546,8.093867e-15,499.68528,259.15452,3.987109,4.544885,4.265997
38,Add DA pearson,rendered_audio/optimized_output_additive_cello...,0.144614,0.347185,0.156722,0.802019,0.225534,7.913066,48456000000.0,153.919098,0.0117586,1171.899381,1983.015463,3.491147,3.711181,3.601164
44,Add DE manhattan,rendered_audio/optimized_output_additive_cello...,0.165533,1.993767,0.027648,0.303855,1.093723,8.502432,1922298.0,166.401077,1.98625e-15,143.818916,301.455953,2.200386,2.00546,2.102923
21,FM DE mfcc,rendered_audio/optimized_output_fm_cello_singl...,0.180264,1.978077,0.057147,0.406032,1.315893,5.744286,48455920000.0,33.42149,0.01156079,1181.727191,2055.307378,2.527757,1.243237,1.885497
45,Add DE mfcc,rendered_audio/optimized_output_additive_cello...,0.181254,1.999708,0.046595,0.344955,1.392127,6.9145,63524.91,51.031601,1.299377e-10,172.675921,322.383413,3.197579,2.524543,2.861061
43,Add DE kl,rendered_audio/optimized_output_additive_cello...,0.187242,0.313262,0.478078,0.415823,0.336848,19.476889,12.7355,200.551849,6.894209e-10,1891.992865,4015.972999,6.267578,5.136194,5.701886


In [8]:
# Save CSV summary for reference (include all metrics + composite)
out_csv = Path('examples/cello_eval.csv')
df_rank_sorted.to_csv(out_csv, index=False)
print('Saved:', out_csv.resolve())


Saved: C:\Users\egorp\Nextcloud\code\public_repos\FFTimbre\examples\cello_eval.csv


In [9]:
# Quick sanity checks: metrics on identical signals should be ~0.
from IPython.display import display

x, _ = librosa.load(str(target_audio), sr=SR, mono=True)
xe = x.copy()
metrics_ident = eval_pair(x, xe)
print('Target vs Target metrics:')
display(pd.Series(metrics_ident))

# Also test a trivial mismatch (silence) for non-zero
sil = np.zeros_like(x)
metrics_sil = eval_pair(x, sil)
print('Target vs Silence metrics:')
display(pd.Series(metrics_sil))



Target vs Target metrics:


time_mse                0.000000e+00
cosine_logmag           0.000000e+00
pearson_logmag          6.590777e-08
spectral_convergence    0.000000e+00
lsd_db                  0.000000e+00
itakura_saito           1.091760e-04
mfcc_l2                 0.000000e+00
flatness_l1             0.000000e+00
centroid_rmse_hz        0.000000e+00
rolloff_rmse_hz         0.000000e+00
mrstft                  0.000000e+00
mel_l1                  0.000000e+00
mel_mrstft              0.000000e+00
dtype: float64

Target vs Silence metrics:


time_mse                9.999998e-01
cosine_logmag           5.414735e-02
pearson_logmag          1.000000e+00
spectral_convergence    1.000000e+00
lsd_db                  7.954536e+01
itakura_saito           7.639641e+12
mfcc_l2                 6.295580e+02
flatness_l1             1.000001e+00
centroid_rmse_hz        1.075350e+04
rolloff_rmse_hz         1.832490e+04
mrstft                  1.729699e+01
mel_l1                  1.593629e+01
mel_mrstft              1.661664e+01
dtype: float64

## Interactive exploration of evaluation results

Load `examples/cello_eval.csv` and explore metrics and composite with filters, sorting, and plots. Audio players for selected Top K are shown below the chart.


In [10]:
# Interactive explorer (no Plotly); adds inline audio players for selected Top K
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

try:
    import ipywidgets as W
    from IPython.display import display, clear_output, Audio
    HAS_WIDGETS = True
except Exception:
    HAS_WIDGETS = False

csv_path = Path('examples/cello_eval.csv')
assert csv_path.exists(), f'Missing {csv_path}'
base_df = pd.read_csv(csv_path)

# Identify metric columns and ensure composite is present if available
known_metrics = ['time_mse','cosine_logmag','pearson_logmag','spectral_convergence','lsd_db','itakura_saito','mfcc_l2','flatness_l1','centroid_rmse_hz','rolloff_rmse_hz','mrstft','mel_l1','mel_mrstft']
metric_cols = [c for c in base_df.columns if c in known_metrics]
extra_cols = [c for c in ['composite'] if c in base_df.columns]
metrics_all = extra_cols + metric_cols

if not HAS_WIDGETS:
    print('ipywidgets not available; showing static tables/plots')
    display(base_df.head())
else:
    # Controls
    metric_dd = W.Dropdown(options=metrics_all or metric_cols, value=(metrics_all or metric_cols)[0], description='Metric:')
    sort_dir = W.ToggleButtons(options=['asc','desc'], value='asc', description='Sort:')
    filter_text = W.Text(value='', description='Filter title:')
    top_k = W.IntSlider(value=10, min=5, max=max(10, len(base_df)), step=1, description='Top K:')
    normalize = W.Checkbox(value=False, description='Normalize 0..1 (per-metric)')

    ui = W.VBox([
        W.HBox([metric_dd, sort_dir, normalize]),
        W.HBox([filter_text, top_k])
    ])

    out = W.Output()

    def render(_=None):
        with out:
            clear_output(wait=True)
            df = base_df.copy()
            if filter_text.value.strip():
                q = filter_text.value.strip().lower()
                df = df[df['title'].str.lower().str.contains(q)]
            m = metric_dd.value
            if normalize.value:
                lo, hi = df[m].min(), df[m].max()
                df['_val'] = 0.5 if hi - lo < 1e-12 else (df[m] - lo) / (hi - lo)
            else:
                df['_val'] = df[m]
            ascending = (sort_dir.value == 'asc')
            df = df.sort_values('_val', ascending=ascending).head(top_k.value)

            # Show table
            display(df[['title','path', m]].rename(columns={m: f'{m}'}))

            # Matplotlib bar
            plt.figure(figsize=(10, max(3, 0.4*len(df))))
            plt.barh(df['title'], df['_val'], color='steelblue')
            plt.gca().invert_yaxis()
            plt.xlabel(f'{m} (normalized)' if normalize.value else m)
            plt.title('Top results')
            plt.tight_layout()
            plt.show()

            # Inline audio players (aligned to target duration)
            # Robustly select a valid target path to avoid deprecated audioread fallback
            _target_candidate = None
            if 'target_path' in base_df.columns and len(base_df['target_path']) > 0:
                _cand = str(base_df['target_path'].iloc[0])
                if isinstance(_cand, str) and _cand.strip() and Path(_cand).is_file():
                    _target_candidate = Path(_cand)
            if _target_candidate is None:
                _target_candidate = Path('rendered_audio/additive_from_cello_single_2.0s_20250906-215542.wav')
            target, _ = librosa.load(str(_target_candidate), sr=SR, mono=True)
            display(W.HTML('<b>Audio (in current order):</b>'))
            for _, r in df.iterrows():
                try:
                    y, _ = librosa.load(r['path'], sr=SR, mono=True)
                    _, y_al = align_signals(target, y, MAX_ALIGN_SEC)
                    y_al = np.nan_to_num(y_al, nan=0.0, posinf=0.0, neginf=0.0)
                    display(W.HTML(f"<div style='margin:6px 0'><b>{r['title']}</b> &nbsp; <small>{r['path']}</small></div>"))
                    display(Audio(y_al, rate=SR, normalize=False))
                except Exception as ex:
                    display(W.HTML(f"<div style='color:#b00'>Audio load failed: {r['path']} ({ex})</div>"))

    for w in [metric_dd, sort_dir, filter_text, top_k, normalize]:
        w.observe(render, names='value')

    display(ui, out)
    render()



VBox(children=(HBox(children=(Dropdown(description='Metric:', options=('composite', 'time_mse', 'cosine_logmag…

Output()