To Preform evaluation on speech using computational evaluation tools such as PESQ, STOI and SDR both the reference (clean audio) and enhanced speech (mixture through models) are required. The enhanced speech is created with this program 

What is required:
- Provide the CNN classification model from "class_model.ipynb"
- Provide the GAN waveform generator from ""
- Before performing step 4 is in the "path_eval" provide the clean and noisy audio files and the program will enhance the speech and evaluate through all iterations.
- The name of the audio must be called speech_"number".wav the same as the create_dataset convention (With the text inside "" being changed to a number)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import librosa
import tensorflow as tf
import os
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import numpy as np
import scipy.signal as sps
import soundfile as sf
import os
import math
import wave
from scipy.io import wavfile
from keras.models import load_model
from keras.models import Sequential
import xlsxwriter
import openpyxl

from pesq import pesq, NoUtterancesError
from pystoi import stoi
import mir_eval

import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

%matplotlib inline
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print(physical_devices)
if physical_devices:
  tf.config.experimental.set_memory_growth(physical_devices[0], True)

2.0.1+cpu
2.0.2+cpu
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
def create_spectrogram(audio_file, image_file):
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1)

    y, sr = librosa.load(audio_file)
    ms = librosa.feature.melspectrogram(y=y, sr=sr, S=None, n_fft=1024, hop_length=80, win_length=320, window='hann', center=True, pad_mode='constant', power=2.0)
    #ms = librosa.feature.melspectrogram(y=y, sr=sr, S=None)
    log_ms = librosa.power_to_db(ms, ref=np.max)
    librosa.display.specshow(log_ms, sr=sr)

    fig.savefig(image_file)
    plt.close(fig)

Step 1) Loading CNN classification and GAN models

In [3]:
# Loading the CNN classification model
# Recreate the exact same model, including its weights and the optimizer
cnn_model = tf.keras.models.load_model('../Models/Classification-Models/my_model.h5')

# Show the model architecture
cnn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 222, 222, 32)      896       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 111, 111, 32)     0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 109, 109, 128)     36992     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 54, 54, 128)      0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 52, 52, 128)       147584    
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 26, 26, 128)      0

In [4]:
# Load the noise GAN model
Noise_gan = tf.saved_model.load('../Models/GAN-Models/Full_Curriculum_4_generator')

In [5]:
# GAN noise subtraction
def hann_window_a_signal(Windowed_data):
    Hann_window = sps.windows.hann(len(Windowed_data))
    Hann_Windowed_data = Hann_window * Windowed_data
    padded_signal = np.pad(Hann_Windowed_data, (0, 512), 'constant')
    Windowed_data_fft = np.fft.fft(padded_signal, 1024)
    return Windowed_data_fft

def perform_gan_noise_subtraction(input_audio_file, output_audio_file, alpha, Noise_gan):
    N_fft = 1024

    samplerate, data = wavfile.read(input_audio_file)
    Bit_Check = wave.open(input_audio_file, 'rb')
    bit_depth = Bit_Check.getsampwidth() * 8
    data = data / (2 ** (bit_depth - 1))
    Overlaps = math.floor(len(data) / 128)
    audio_ss = np.zeros(len(data))

    for No_of_overlaps in range(Overlaps - 5):
        Rectangular_windowed_signal = data[0 + 128 * No_of_overlaps:512 + 128 * No_of_overlaps]
        GAN_noise_estimate = np.zeros(N_fft)
        FFT_of_windowed_signal = hann_window_a_signal(Rectangular_windowed_signal)

        Hann_window = sps.windows.hann(len(Rectangular_windowed_signal))
        PSD_window_scaling = np.sum(Hann_window ** 2)
        PSD_of_windowed_signal = (np.abs(FFT_of_windowed_signal) ** 2) / (samplerate * PSD_window_scaling)

        Tensor_PSD = tf.convert_to_tensor(PSD_of_windowed_signal.reshape(1, 1024), tf.float32)
        Generated_codebook = Noise_gan(Tensor_PSD)
        Generated_codebook = Generated_codebook.numpy()
        Generated_codebook_reshaped = np.abs((Generated_codebook.reshape(1024, 9)))

        Generated_codebook_inverse = np.linalg.pinv(Generated_codebook_reshaped, rcond=1e-15)
        Generated_coeffs = Generated_codebook_inverse * PSD_of_windowed_signal
        Generated_coeffs = np.transpose(Generated_coeffs)
        GAN_noise_codebook = (Generated_coeffs * Generated_codebook_reshaped)
        GAN_noise_codebook = GAN_noise_codebook.clip(min=0)

        for Freq_bin in range(0, N_fft):
            GAN_noise_estimate[Freq_bin] = np.sum(GAN_noise_codebook[Freq_bin, :])

        GAN_noise_estimate[512:1024] = np.flip(GAN_noise_estimate[0:512])

        snr = PSD_of_windowed_signal / (100 * GAN_noise_estimate)
        Spectral_mask = 1 - np.minimum(1, np.maximum(0, alpha * snr))
        Clean_signal = FFT_of_windowed_signal * Spectral_mask
        Clean_frames = np.fft.ifft(Clean_signal)

        audio_ss[0 + 128 * No_of_overlaps:512 + 128 * No_of_overlaps] = audio_ss[
                                                                           0 + 128 * No_of_overlaps:512 + 128 * No_of_overlaps] + Clean_frames[0:512]

    # Save the processed audio
    sf.write(output_audio_file, audio_ss, samplerate, 'PCM_16')

Step 2) Function using compuatational evaluation tools such as PESQ, STOI and SDR

https://pytorch.org/audio/0.13.1/tutorials/mvdr_tutorial.html

SDR means estimated speech contains more desired source signal and less undesired sources or noise (HIGHER FOR BETTER SPEECH QUALITY)

The STOI score is a value between 0 and 1, where 0 represents no intelligibility (completely unintelligible) and 1 represents perfect intelligibility (no degradation, the speech is fully understandable).

In the PESQ scale, the scores generally range from approximately -0.5 to 4.5, with specific meanings attributed to different score ranges: Scores above 4.0: Excellent quality. The degraded speech is very close to the reference, and the quality is perceived as excellent.


In [8]:
SAMPLE_RATE = 16000

def si_snr(estimate, reference, epsilon=1e-8):
    estimate = estimate - estimate.mean()
    reference = reference - reference.mean()
    reference_pow = reference.pow(2).mean(axis=1, keepdim=True)
    mix_pow = (estimate * reference).mean(axis=1, keepdim=True)
    scale = mix_pow / (reference_pow + epsilon)

    reference = scale * reference
    error = estimate - reference

    reference_pow = reference.pow(2)
    error_pow = error.pow(2)

    reference_pow = reference_pow.mean(axis=1)
    error_pow = error_pow.mean(axis=1)

    si_snr = 10 * torch.log10(reference_pow) - 10 * torch.log10(error_pow)
    return si_snr.item()

def evaluate(estimate, reference):
    try:
        si_snr_score = si_snr(estimate, reference)
        (
            sdr,
            _,
            _,
            _,
        ) = mir_eval.separation.bss_eval_sources(reference.numpy(), estimate.numpy(), False)
        pesq_mix = pesq(SAMPLE_RATE, estimate[0].numpy(), reference[0].numpy(), "wb")
        stoi_mix = stoi(reference[0].numpy(), estimate[0].numpy(), SAMPLE_RATE, extended=False)
        return pesq_mix, stoi_mix, sdr[0], si_snr_score
    except NoUtterancesError as e:
        return 0, 0, 0, 0  # Or any default values you prefer

Step 3) Create directories to save the evaluation preformed

In [20]:
# Creating Directory to save evaluation
path_eval = "Evaluation1"

isExist = os.path.exists(path_eval)
if not isExist:
    # Create a new directory because it does not exist
    os.makedirs(path_eval)
    os.makedirs(path_eval + "/clean")
    os.makedirs(path_eval + "/noisy")
    os.makedirs(path_eval + "/enhanced")
    print("the %s directory is created!", path_eval)

In [21]:
# Creating evaluation Excel file
path_excel = path_eval + "/evaluation.xlsx"

isExist3 = os.path.exists(path_excel)
if not isExist3:
   workbook1 = xlsxwriter.Workbook(path_excel)
   worksheet1 = workbook1.add_worksheet()
   worksheet1.write(0, 0, "Number")
   worksheet1.write(0, 1, "Name")
   worksheet1.write(0, 2, "Original SNR(dB)")
   worksheet1.write(0, 3, "Alpha Value")
   worksheet1.write(0, 4, "SNR(dB)")
   worksheet1.write(0, 5, "PESQ Evaluation")
   worksheet1.write(0, 6, "STOI Evaluation")
   worksheet1.write(0, 7, "SDR Evaluation")
   worksheet1.write(0, 8, "SI-SDR Evaluation")
   worksheet1.write(0, 9, "PESQ Highest")
   worksheet1.write(0, 10, "STOI Highest")
   print("The excel file 1 is created!")
   workbook1.close()

The excel file 1 is created!


Step 4) Using the clean and noisy audio to perform speech enhancement with evaluation

- Make sure to include the noisy and clean audio in the "path_eval" folder

In [22]:
# Load the clean audio folders
clean_path = path_eval + "/clean"
noisy_path = path_eval + "/noisy"
enhanced_path = path_eval + "/enhanced"

In [25]:
# For all noisy files
entries = os.listdir(noisy_path)
print("Amount of noisy folders to enhance and evaluate: " + str(len(entries)))

noisy = ["-3dB", "-6dB", "-9dB", "0dB"] # Considered noisy classes
clean = ["3dB", "6dB", "9dB", "clean"] # Considered clean classes
target_names = ["clean", "0dB", "-3dB", "-6dB", "-9dB", "3dB", "6dB", "9dB"] # All classes

Amount of noisy folders to enhance and evaluate: 2


In [24]:
i = 0
for ent in entries:
    edit_ent = ent.replace(".wav", "")
    # Create a new directory at each noisy speech file
    os.makedirs(enhanced_path + "/{}/".format(edit_ent) + "spectrogram/", exist_ok=True)
    os.makedirs(enhanced_path + "/{}/".format(edit_ent) + "wav/", exist_ok=True)
    new_ent_img = enhanced_path + "/{}/".format(edit_ent) + "spectrogram/"
    new_ent_wav = enhanced_path + "/{}/".format(edit_ent) + "wav/"
    create_spectrogram(noisy_path + "/" + ent, new_ent_img + 'noisy.png')
    create_spectrogram(clean_path + "/" + ent, new_ent_img + 'clean.png')
    current_image = new_ent_img + 'noisy.png' # Orignal noisy spectrogram image
    noisy_wav = noisy_path + "/" + ent # Original noisy wav file
    clean_wav = clean_path + "/" + ent # Original clean wav file
    audio_clean = False
    stop_value = 2 # Maximum alpha value
    start_value = 0 # Minimum alpha value
    step_value = 0.05 # Step size for alpha value
    counter = 0 # Counter for number of iterations

    STOI_Highest = 0
    PESQ_Highest = 0
    
    while(audio_clean == False and start_value < stop_value):
        counter += 1
        # Preprocess the image
        image = img_to_array(load_img(current_image, target_size=(224, 224, 3)))
        image = image / 255.0
        image = np.expand_dims(image, axis=0)

        # Make predictions
        predictions = cnn_model.predict(image)
        # Find the index of the maximum prediction
        predicted_index = np.argmax(predictions)
        # Check if the predicted class matches the target
        predicted_class = target_names[predicted_index]

        # Save the enhanced audio and spectrogram
        start_value = round(start_value, 2)
        print("{}: Alpha value: {} Class: {}".format(edit_ent, "{:.2f}".format(start_value), predicted_class), end="\r")
        output_audio = new_ent_wav + 'Alpha{}.wav'.format("{:.2f}".format(start_value))
        perform_gan_noise_subtraction(noisy_wav, output_audio, start_value, Noise_gan)
        create_spectrogram(output_audio, new_ent_img + 'Alpha{}.png'.format("{:.2f}".format(start_value)))
        current_image = new_ent_img + 'Alpha{}.png'.format("{:.2f}".format(start_value))
        
        # Evaluation
        SAMPLE_CLEAN = clean_wav
        SAMPLE_NOISY = output_audio
        waveform_clean, sr = torchaudio.load(SAMPLE_CLEAN)
        waveform_noisy, sr2 = torchaudio.load(SAMPLE_NOISY)

        PESQ, STOI, SDR, SI_SNR_SCORE = evaluate(waveform_noisy[0:1], waveform_clean[0:1])

        # Saving the best evaluation
        if STOI > STOI_Highest:
            STOI_Highest = STOI
            STOI_saved = i 
        if PESQ > PESQ_Highest:
            PESQ_Highest = PESQ
            PESQ_saved = i

        # Adding the evaluation information to the excel file
        workfile1 = openpyxl.load_workbook(path_excel)
        sheet1 = workfile1.active

        sheet1.cell(row=i+2, column=1).value = counter
        sheet1.cell(row=i+2, column=2).value = edit_ent
        sheet1.cell(row=i+2, column=3).value = "N/A"
        sheet1.cell(row=i+2, column=4).value = start_value
        sheet1.cell(row=i+2, column=5).value = predicted_class
        sheet1.cell(row=i+2, column=6).value = PESQ
        sheet1.cell(row=i+2, column=7).value = STOI
        sheet1.cell(row=i+2, column=8).value = SDR
        sheet1.cell(row=i+2, column=9).value = SI_SNR_SCORE
        workfile1.save(path_excel)
        i += 1
        
        # Check if the audio is clean
        if predicted_class in clean:
            audio_clean = True
        else: 
            start_value += step_value

    workfile1 = openpyxl.load_workbook(path_excel)
    sheet1 = workfile1.active

    sheet1.cell(row=PESQ_saved+2, column=10).value = PESQ_Highest
    sheet1.cell(row=STOI_saved+2, column=11).value = STOI_Highest

    workfile1.save(path_excel)

    i += 1

    # Print the evaluation information
    print("{}: Current SNR: {}, Alpha value: {}, Class: {}, PESQ: {}, STOI: {}".format(edit_ent, "N/A", "{:.2f}".format(start_value), predicted_class, PESQ, STOI))

speech_1: Alpha value: 0.00 Class: -3dB

  audio_ss[0 + 128 * No_of_overlaps:512 + 128 * No_of_overlaps] = audio_ss[


speech_1: Current SNR: Random, Alpha value: 2.00, Class: -3dB, PESQ: 1.130240797996521, STOI: 0.5481476040184929
speech_2: Current SNR: Random, Alpha value: 0.00, Class: 9dB, PESQ: 1.0515708923339844, STOI: 0.6186998364983282
