In [9]:
import numpy as np
import librosa
!pip install mir_eval
import mir_eval
from scipy.io import wavfile
import os



# Funcție pentru a alinia lungimile semnalelor audio
def match_length(reference, estimated):
    max_length = max(reference.shape[-1], estimated.shape[-1])
    if reference.shape[-1] < max_length:
        reference = np.pad(reference, [(0, 0), (0, max_length - reference.shape[-1])], mode='constant')
    if estimated.shape[-1] < max_length:
        estimated = np.pad(estimated, [(0, 0), (0, max_length - estimated.shape[-1])], mode='constant')
    return reference, estimated

# Încărcarea datelor
file_path = '/content/Kid CuDi Day N Nite.wav'
y, sr = librosa.load(file_path, duration=120)

# Setări pentru experimente
fft_sizes = [512, 1024, 2048]
overlaps = [0.25, 0.5]
margin_settings = [(1, 5), (2, 10), (3, 15)]
powers = [1, 2, 3]
results = []

# Bucle pentru fiecare combinație de setări
for fft_size in fft_sizes:
    for overlap in overlaps:
        for margins in margin_settings:
            for power in powers:
                hop_length = int(fft_size * overlap)
                S_full, phase = librosa.magphase(librosa.stft(y, n_fft=fft_size, hop_length=hop_length))
                margin_i, margin_v = margins

                # Filtrarea mediană și crearea măștilor
                S_filter = librosa.decompose.nn_filter(S_full, aggregate=np.median, metric='cosine', width=int(librosa.time_to_frames(2, sr=sr)))
                S_filter = np.minimum(S_full, S_filter)
                mask_i = librosa.util.softmask(S_filter, margin_i * (S_full - S_filter), power=power)
                mask_v = librosa.util.softmask(S_full - S_filter, margin_v * S_filter, power=power)

                # Separarea surselor
                S_background = mask_i * S_full
                S_foreground = mask_v * S_full
                y_foreground = librosa.istft(S_foreground * phase)
                y_background = librosa.istft(S_background * phase)

                # Alinierea și evaluarea
                reference_audio, estimated_audio = match_length(np.array([y, y]), np.array([y_foreground, y_background]))
                sdr, sir, sar, _ = mir_eval.separation.bss_eval_sources(reference_audio, estimated_audio)
                results.append((fft_size, overlap, margins, power, sdr.mean(), sir.mean(), sar.mean()))

# Afișarea rezultatelor
for result in results:
    print(f"FFT Size: {result[0]}, Overlap: {result[1]}, Margins: {result[2]}, Power: {result[3]}, SDR: {result[4]}, SIR: {result[5]}, SAR: {result[6]}")

FFT Size: 512, Overlap: 0.25, Margins: (1, 5), Power: 1, SDR: 7.517081142193663, SIR: 159.1357147405071, SAR: 7.517081142102157
FFT Size: 512, Overlap: 0.25, Margins: (1, 5), Power: 2, SDR: 5.089575070360893, SIR: 156.45727868957957, SAR: 5.089575070206778
FFT Size: 512, Overlap: 0.25, Margins: (1, 5), Power: 3, SDR: 4.146672289425023, SIR: 155.73317585900625, SAR: 4.1466722892961
FFT Size: 512, Overlap: 0.25, Margins: (2, 10), Power: 1, SDR: 5.581634614183726, SIR: 157.82589850667898, SAR: 5.581634614094224
FFT Size: 512, Overlap: 0.25, Margins: (2, 10), Power: 2, SDR: 2.831391119514312, SIR: 154.9797636975466, SAR: 2.831391119549701
FFT Size: 512, Overlap: 0.25, Margins: (2, 10), Power: 3, SDR: 1.947126875036198, SIR: 154.29506540706745, SAR: 1.9471268752750674
FFT Size: 512, Overlap: 0.25, Margins: (3, 15), Power: 1, SDR: 4.451633688435646, SIR: 157.20416500679153, SAR: 4.451633688366622
FFT Size: 512, Overlap: 0.25, Margins: (3, 15), Power: 2, SDR: 1.5062816101724046, SIR: 154.4062