In [None]:
import import_ipynb
import numpy as np
import librosa
import os
import re
from scipy.linalg import norm
import soundfile as sf

In [None]:
# --- Input folders -----------------------------------------------------------
voice_dir = './speech/'   # folder scanned for clean speech recordings
noise_dir = './noise/'    # folder scanned for noise recordings

# --- Time-domain framing -----------------------------------------------------
sample_rate = 8000        # rate (Hz) requested from librosa.load and used when writing WAVs
frame_length = 8064       # samples per training frame
hop_length_frame = 8064   # step between frame starts; equal to frame_length => no overlap

# --- STFT settings -----------------------------------------------------------
# NOTE(review): n_fft/hop_length look chosen so one frame yields a square
# (n_fft // 2 + 1)-sided spectrogram, which extract_stft_features requires — confirm.
n_fft = 255               # FFT window size handed to librosa.stft
hop_length = 63           # STFT hop size handed to librosa.stft

# --- Mixing ------------------------------------------------------------------
SNR = 0                   # target signal-to-noise ratio in dB (noise gain is 10**(-SNR/20))

In [None]:
# List the recordings available in each input folder.
# os.listdir returns entries in arbitrary order, so sort them to make the
# frame order (and therefore the voice/noise pairing) reproducible across runs.
list_noise_files = sorted(os.listdir(noise_dir))
list_voice_files = sorted(os.listdir(voice_dir))

# Keep the two counts in separate variables: the voice count used to overwrite
# the noise count, so the displayed tuple showed the same number twice.
nb_noise_files = len(list_noise_files)
nb_voice_files = len(list_voice_files)
nb_noise_files, nb_voice_files

In [None]:
def audio_to_audio_frame_stack(audio, frame_length, hop_length):
    """Cut a 1-D signal into fixed-size frames and stack them row-wise.

    Frames start at 0, hop_length, 2*hop_length, ... and only complete frames
    are kept; trailing samples that do not fill a whole frame are dropped.

    Args:
        audio: 1-D sequence of samples (anything supporting len() and slicing).
        frame_length: number of samples per frame.
        hop_length: step, in samples, between consecutive frame starts.

    Returns:
        2-D array of shape (n_frames, frame_length). When the signal is shorter
        than frame_length, an empty (0, frame_length) array is returned so that
        a single short clip does not abort the whole stacking pipeline.
    """
    total_samples = len(audio)
    frames = [
        audio[start:start + frame_length]
        for start in range(0, total_samples - frame_length + 1, hop_length)
    ]
    if not frames:
        # np.vstack([]) raises "need at least one array to concatenate";
        # return a well-shaped empty stack with the input's dtype instead.
        return np.empty((0, frame_length), dtype=np.asarray(audio).dtype)
    return np.vstack(frames)

def audio_files_to_numpy(audio_dir, list_audio_files, sample_rate, frame_length, hop_length_frame):
    """Load every listed file and stack all of their frames into one array.

    Args:
        audio_dir: folder containing the audio files.
        list_audio_files: file names (relative to audio_dir) to load, in order.
        sample_rate: value passed as ``sr`` to librosa.load.
        frame_length: number of samples per frame.
        hop_length_frame: step, in samples, between consecutive frame starts.

    Returns:
        2-D array whose rows are the frames of all files, concatenated in the
        order the files are listed.
    """
    def _frames_of(file_name):
        # librosa.load returns (samples, sampling_rate); only the samples are used.
        samples, _ = librosa.load(os.path.join(audio_dir, file_name), sr=sample_rate)
        return audio_to_audio_frame_stack(samples, frame_length, hop_length_frame)

    per_file_frames = [_frames_of(file_name) for file_name in list_audio_files]
    return np.vstack(per_file_frames)

def mixed_voice_with_noise(voice, noise, nb_samples, frame_length, SNR):
    """Mix voice frames with noise frames rescaled to a target SNR.

    For each of the first nb_samples rows, the noise frame is normalised to
    unit L2 norm and then scaled to ``10**(-SNR/20)`` times the L2 norm of the
    matching voice frame, before being added to the voice.

    Args:
        voice: 2-D array of voice frames, at least nb_samples rows.
        noise: 2-D array of noise frames, at least nb_samples rows.
        nb_samples: number of frame pairs to mix.
        frame_length: number of samples per frame (second dimension of outputs).
        SNR: target signal-to-noise ratio in dB.

    Returns:
        Tuple (prod_voice, prod_noise, prod_noisy_voice), each a float array of
        shape (nb_samples, frame_length): the voice frames, the rescaled noise
        frames, and their sum.
    """
    prod_voice = np.zeros((nb_samples, frame_length))
    prod_noise = np.zeros((nb_samples, frame_length))
    prod_noisy_voice = np.zeros((nb_samples, frame_length))

    # Amplitude gain applied to the unit-norm noise; identical for every frame.
    noise_gain = 10 ** (-SNR / 20)

    for i in range(nb_samples):
        prod_voice[i, :] = voice[i, :]
        noise_norm = norm(noise[i, :])
        if noise_norm > 0:
            prod_noise[i, :] = noise[i, :] / noise_norm * noise_gain * norm(voice[i, :])
        # A silent (all-zero) noise frame cannot be normalised: dividing by its
        # zero norm would fill the row with NaN. Leave its contribution at zero.
        prod_noisy_voice[i, :] = prod_voice[i, :] + prod_noise[i, :]
    return prod_voice, prod_noise, prod_noisy_voice

def calculate_stft(n_fft, hop_length_fft, audio):
    """Compute the STFT of one signal and split it into magnitude and phase.

    Args:
        n_fft: FFT window size passed to librosa.stft.
        hop_length_fft: hop size passed to librosa.stft.
        audio: 1-D signal to transform.

    Returns:
        Tuple (magnitude, phase) as produced by librosa.magphase.
    """
    # Per the librosa docs, magphase returns the phase as a complex array
    # (not as angles), hence the complex buffer used by the caller.
    magnitude, phase = librosa.magphase(
        librosa.stft(audio, n_fft=n_fft, hop_length=hop_length_fft)
    )
    return magnitude, phase

def extract_stft_features(numpy_audio, dim_square_spec, n_fft, hop_length_fft):
    """Compute magnitude and phase spectrograms for every row of a frame stack.

    Args:
        numpy_audio: 2-D array, one audio frame per row.
        dim_square_spec: side length of the (square) spectrogram of one frame;
            the STFT of each row must have shape (dim_square_spec, dim_square_spec).
        n_fft: FFT window size forwarded to calculate_stft.
        hop_length_fft: STFT hop size forwarded to calculate_stft.

    Returns:
        Tuple (mag, phase), both of shape
        (n_frames, dim_square_spec, dim_square_spec); mag is float, phase is complex.
    """
    out_shape = (numpy_audio.shape[0], dim_square_spec, dim_square_spec)
    mag = np.zeros(out_shape)
    phase = np.zeros(out_shape, dtype=complex)

    for idx in range(out_shape[0]):
        frame_mag, frame_phase = calculate_stft(n_fft, hop_length_fft, numpy_audio[idx])
        mag[idx, :, :] = frame_mag
        phase[idx, :, :] = frame_phase
    return mag, phase

In [None]:
# Read every noise and voice recording and cut it into fixed-size frames
# (one frame per row of the resulting arrays).
noise = audio_files_to_numpy(
    audio_dir=noise_dir,
    list_audio_files=list_noise_files,
    sample_rate=sample_rate,
    frame_length=frame_length,
    hop_length_frame=hop_length_frame,
)
voice = audio_files_to_numpy(
    audio_dir=voice_dir,
    list_audio_files=list_voice_files,
    sample_rate=sample_rate,
    frame_length=frame_length,
    hop_length_frame=hop_length_frame,
)
voice.shape, noise.shape

In [None]:
# Voice and noise are mixed frame by frame, so keep only as many frames as
# the smaller of the two stacks provides.
l = np.min([len(voice), len(noise)])
voice, noise = voice[:l], noise[:l]
voice.shape, noise.shape

In [None]:
# Side length of the square spectrogram of one frame, derived from the FFT
# size as n_fft // 2 + 1.
dim_square_spec = n_fft // 2 + 1
dim_square_spec

In [None]:
# Output folder for the WAV files written further down.
# exist_ok=True keeps this cell re-runnable: without it a second run (or a
# Restart & Run All with the folder already on disk) raises FileExistsError.
os.makedirs('sound', exist_ok=True)

# Number of voice/noise frame pairs that will be mixed.
nb_samples = voice.shape[0]
nb_samples

In [None]:
# Mix each voice frame with its noise frame at the configured SNR.
prod_voice, prod_noise, prod_noisy = mixed_voice_with_noise(
    voice=voice,
    noise=noise,
    nb_samples=nb_samples,
    frame_length=frame_length,
    SNR=SNR,
)
# The unmixed frame stacks are no longer needed; their names are rebound below.
del voice, noise

# Lay all frames of each signal end to end as a single row of samples.
voice = prod_voice.reshape((1, nb_samples * frame_length))
noise = prod_noise.reshape((1, nb_samples * frame_length))
noisy = prod_noisy.reshape((1, nb_samples * frame_length))

# Export the three long signals as WAV files (the ./sound folder must exist).
sf.write('./sound/voice.wav', voice[0], samplerate=sample_rate)
sf.write('./sound/noise.wav', noise[0], samplerate=sample_rate)
sf.write('./sound/noisy.wav', noisy[0], samplerate=sample_rate)

In [None]:
# Compute magnitude/phase spectrograms for the clean voice, the scaled noise
# and the noisy mixture. Each time-domain stack is deleted as soon as its
# spectrograms exist, to limit peak memory use.
voice_mag, voice_phase = extract_stft_features(prod_voice, dim_square_spec, n_fft, hop_length)
del prod_voice
noise_mag, noise_phase = extract_stft_features(prod_noise, dim_square_spec, n_fft, hop_length)
del prod_noise
noisy_mag, noisy_phase = extract_stft_features(prod_noisy, dim_square_spec, n_fft, hop_length)
del prod_noisy

# Output folder for the saved arrays. exist_ok=True avoids a FileExistsError
# when the folder is already present from a previous run.
os.makedirs('spectrogram_data', exist_ok=True)
voice_mag.shape, voice_phase.shape, noise_mag.shape, noise_phase.shape, noisy_mag.shape, noisy_phase.shape

In [None]:
# Give every spectrogram stack a trailing axis of size 1, i.e. shape
# (n_frames, height, width, 1) — presumably the channel axis expected by a
# downstream model; confirm against the consumer of these files.
# reshape (rather than adding a new axis) keeps this cell idempotent on re-run.
voice_mag = voice_mag.reshape(voice_mag.shape[:3] + (1,))
voice_phase = voice_phase.reshape(voice_phase.shape[:3] + (1,))

noise_mag = noise_mag.reshape(noise_mag.shape[:3] + (1,))
noise_phase = noise_phase.reshape(noise_phase.shape[:3] + (1,))

noisy_mag = noisy_mag.reshape(noisy_mag.shape[:3] + (1,))
noisy_phase = noisy_phase.reshape(noisy_phase.shape[:3] + (1,))
voice_mag.shape, voice_phase.shape, noise_mag.shape, noise_phase.shape, noisy_mag.shape, noisy_phase.shape

In [None]:
# Persist the six arrays into ./spectrogram_data (np.save appends the ".npy"
# extension to each path). The folder must already exist.
np.save(file='./spectrogram_data/voice_mag', arr=voice_mag)
np.save(file='./spectrogram_data/voice_phase', arr=voice_phase)
np.save(file='./spectrogram_data/noise_mag', arr=noise_mag)
np.save(file='./spectrogram_data/noise_phase', arr=noise_phase)
np.save(file='./spectrogram_data/noisy_mag', arr=noisy_mag)
np.save(file='./spectrogram_data/noisy_phase', arr=noisy_phase)