In [9]:
import os
import gc
import scipy
import glob
import math
import torch
import pickle
import librosa
import warnings
import datetime
import numpy as np
import pandas as pd
import multiprocessing
from pathlib import Path
from utils import read_audio
import matplotlib.pyplot as plt
import IPython.display as ipd
import librosa.display
from sklearn.utils import shuffle
import random
#from feature_extractor import FeatureExtractor
#from utils import prepare_input_features
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
import util 
warnings.filterwarnings(action='ignore')

In [10]:
def load_variables(path):
    with open(path, 'rb') as archivo:
        noise_files = pickle.load(archivo)
    return noise_files

In [11]:
noise_source_files = load_variables('variables/NOISES_STATIONARY_NOISE_S.pkl')
noise_target_files = load_variables('variables/NOISES_NO_STATIONARY_NOISE_T.pkl')
speakers_files = load_variables('variables/SPEAKERS_GAN.pkl')

In [12]:
np.random.seed(999)

In [13]:
speakers_audio = []
for speakers_file in speakers_files:
    speakers_audio.extend(speakers_file)
print(speakers_audio)

['data/Speakers/train-clean-100/103/103-1240-0000.flac', 'data/Speakers/train-clean-100/103/103-1240-0001.flac', 'data/Speakers/train-clean-100/103/103-1240-0002.flac', 'data/Speakers/train-clean-100/103/103-1240-0003.flac', 'data/Speakers/train-clean-100/103/103-1240-0004.flac', 'data/Speakers/train-clean-100/103/103-1240-0005.flac', 'data/Speakers/train-clean-100/103/103-1240-0006.flac', 'data/Speakers/train-clean-100/103/103-1240-0007.flac', 'data/Speakers/train-clean-100/103/103-1240-0008.flac', 'data/Speakers/train-clean-100/103/103-1240-0009.flac', 'data/Speakers/train-clean-100/103/103-1240-0010.flac', 'data/Speakers/train-clean-100/103/103-1240-0011.flac', 'data/Speakers/train-clean-100/103/103-1240-0012.flac', 'data/Speakers/train-clean-100/103/103-1240-0013.flac', 'data/Speakers/train-clean-100/103/103-1240-0014.flac', 'data/Speakers/train-clean-100/103/103-1240-0015.flac', 'data/Speakers/train-clean-100/103/103-1240-0016.flac', 'data/Speakers/train-clean-100/103/103-1240-001

In [14]:
np.random.shuffle(speakers_audio)
print(speakers_audio)

['data/Speakers/train-clean-100/1334/1334-135589-0015.flac', 'data/Speakers/train-clean-100/163/163-122947-0077.flac', 'data/Speakers/train-clean-100/1235/1235-135884-0021.flac', 'data/Speakers/train-clean-100/1235/1235-135883-0024.flac', 'data/Speakers/train-clean-100/1502/1502-122619-0058.flac', 'data/Speakers/train-clean-100/1069/1069-133699-0011.flac', 'data/Speakers/train-clean-100/150/150-126112-0009.flac', 'data/Speakers/train-clean-100/1447/1447-130551-0027.flac', 'data/Speakers/train-clean-100/125/125-121124-0003.flac', 'data/Speakers/train-clean-100/1069/1069-133699-0000.flac', 'data/Speakers/train-clean-100/1455/1455-134435-0028.flac', 'data/Speakers/train-clean-100/1447/1447-130551-0020.flac', 'data/Speakers/train-clean-100/1447/1447-130552-0023.flac', 'data/Speakers/train-clean-100/163/163-122947-0057.flac', 'data/Speakers/train-clean-100/1553/1553-140047-0024.flac', 'data/Speakers/train-clean-100/1069/1069-133699-0042.flac', 'data/Speakers/train-clean-100/1455/1455-134435

In [15]:
#get sets of data
def get_list_of_sets(data_list):
    # lista con las 1500 muestras
    data = data_list
    # número de muestras para cada conjunto
    train_size = int(0.8 * len(data))
    val_size = int(0.1 * len(data))
    test_size = len(data) - train_size - val_size
    # separar las muestras en conjuntos
    train_data = data[:train_size]
    val_data = data[train_size:train_size+val_size]
    test_data = data[train_size+val_size:]
    print(f"Tamaño del conjunto de entrenamiento: {len(train_data)}")
    print(f"Tamaño del conjunto de validación: {len(val_data)}")
    print(f"Tamaño del conjunto de prueba: {len(test_data)}")
    return train_data, val_data, test_data

In [16]:
train_speaker, val_speaker, test_speaker = get_list_of_sets(speakers_audio)

Tamaño del conjunto de entrenamiento: 1600
Tamaño del conjunto de validación: 200
Tamaño del conjunto de prueba: 200


In [17]:
def make_spectrum(filename=None, y=None, feature_type='logmag', _max=None, _min=None):
    if y is not None:
        y = y
    else:
        y, sr = librosa.load(filename, sr=16000)
        if sr != 16000:
            raise ValueError('Sampling rate is expected to be 16kHz!')
        if y.dtype == 'int16':
            y = np.float32(y/32767.)
        elif y.dtype !='float32':
            y = np.float32(y)

    D = librosa.stft(y, center=False, n_fft=512, hop_length=160, win_length=512, window=scipy.signal.hamming)
    phase = np.exp(1j * np.angle(D))
    D = np.abs(D)

    # select feature types
    if feature_type == 'logmag':
        Sxx = np.log1p(D)
    elif feature_type == 'lps':
        Sxx = np.log10(D**2)
    else:
        Sxx = D

    return Sxx, phase, len(y)

In [18]:
CLEAN_PATH_TRAINING_S = 'data/PT_FILES_S/CLEAN/TRAINING/'
Path(CLEAN_PATH_TRAINING_S).mkdir(parents=True, exist_ok=True)
NOISE_PATH_TRAINING_S = 'data/PT_FILES_S/NOISE/TRAINING/'
Path(NOISE_PATH_TRAINING_S).mkdir(parents=True, exist_ok=True)
CLEAN_PATH_TESTING_S = 'data/PT_FILES_S/CLEAN/TESTING/'
Path(CLEAN_PATH_TESTING_S).mkdir(parents=True, exist_ok=True)
NOISE_PATH_TESTING_S = 'data/PT_FILES_S/NOISE/TESTING/'
Path(NOISE_PATH_TESTING_S).mkdir(parents=True, exist_ok=True)
CLEAN_PATH_VALIDATION_S = 'data/PT_FILES_S/CLEAN/VALIDATION/'
Path(CLEAN_PATH_VALIDATION_S).mkdir(parents=True, exist_ok=True)
NOISE_PATH_VALIDATION_S = 'data/PT_FILES_S/NOISE/VALIDATION/'
Path(NOISE_PATH_VALIDATION_S).mkdir(parents=True, exist_ok=True)

In [19]:
CLEAN_PATH_TRAINING_T = 'data/PT_FILES_T/CLEAN/TRAINING/'
Path(CLEAN_PATH_TRAINING_T).mkdir(parents=True, exist_ok=True)
NOISE_PATH_TRAINING_T = 'data/PT_FILES_T/NOISE/TRAINING/'
Path(NOISE_PATH_TRAINING_T).mkdir(parents=True, exist_ok=True)
CLEAN_PATH_TESTING_T = 'data/PT_FILES_T/CLEAN/TESTING/'
Path(CLEAN_PATH_TESTING_T).mkdir(parents=True, exist_ok=True)
NOISE_PATH_TESTING_T = 'data/PT_FILES_T/NOISE/TESTING/'
Path(NOISE_PATH_TESTING_T).mkdir(parents=True, exist_ok=True)
CLEAN_PATH_VALIDATION_T = 'data/PT_FILES_T/CLEAN/VALIDATION/'
Path(CLEAN_PATH_VALIDATION_T).mkdir(parents=True, exist_ok=True)
NOISE_PATH_VALIDATION_T = 'data/PT_FILES_T/NOISE/VALIDATION/'
Path(NOISE_PATH_VALIDATION_T).mkdir(parents=True, exist_ok=True)

In [20]:
noise_target_files

['data/NOISE/stationary/pink.wav',
 'data/NOISE/stationary/dripping_water.wav',
 'data/NOISE/stationary/car.wav',
 'data/NOISE/stationary/cabin.wav',
 'data/NOISE/stationary/rain.wav',
 'data/NOISE/stationary/wind.wav',
 'data/NOISE/stationary/typing.wav']

In [21]:
noise_source_files

['data/NOISE/nonstationary/babycry.wav',
 'data/NOISE/nonstationary/crowd_party_adult_med.wav',
 'data/NOISE/nonstationary/bell_church.wav',
 'data/NOISE/nonstationary/cafeteria_babble.wav',
 'data/NOISE/nonstationary/helicopter.wav',
 'data/NOISE/nonstationary/people.wav',
 'data/NOISE/nonstationary/dog_bark.wav']

In [22]:
SELECTED_NOISE_S = noise_source_files
print(SELECTED_NOISE_S)

['data/NOISE/nonstationary/babycry.wav', 'data/NOISE/nonstationary/crowd_party_adult_med.wav', 'data/NOISE/nonstationary/bell_church.wav', 'data/NOISE/nonstationary/cafeteria_babble.wav', 'data/NOISE/nonstationary/helicopter.wav', 'data/NOISE/nonstationary/people.wav', 'data/NOISE/nonstationary/dog_bark.wav']


In [23]:
SELECTED_NOISE_T = noise_target_files
print(SELECTED_NOISE_T)

['data/NOISE/stationary/pink.wav', 'data/NOISE/stationary/dripping_water.wav', 'data/NOISE/stationary/car.wav', 'data/NOISE/stationary/cabin.wav', 'data/NOISE/stationary/rain.wav', 'data/NOISE/stationary/wind.wav', 'data/NOISE/stationary/typing.wav']


In [24]:
def add_noise_to_clean_audio(clean_audio, noise_signal, decibel):
    if len(clean_audio) >= len(noise_signal):
        # print("The noisy signal is smaller than the clean audio input. Duplicating the noise.")
        while len(clean_audio) >= len(noise_signal):
            noise_signal = np.append(noise_signal, noise_signal)
    ## Extract a noise segment from a random location in the noise file
    ind = np.random.randint(0, noise_signal.size - clean_audio.size)
    noiseSegment = noise_signal[ind: ind + clean_audio.size]
    speech_power = np.var(clean_audio)
    noise = noiseSegment - np.mean(noiseSegment)
    n_var = speech_power / (10**(decibel / 10.))
    noise = np.sqrt(n_var) * noiseSegment / np.std(noiseSegment)
    noisyAudio = clean_audio + noise
    return noisyAudio

In [25]:
def pt_files(index, data, n_frame, path):
    for j in np.arange(data.shape[1] // n_frame):
        out_name = path + str(index).zfill(6) + '_' + str(j).zfill(6) + '.pt'
        torch.save(torch.from_numpy(data.transpose()[j * n_frame:(j + 1) * n_frame]), out_name)

In [28]:
def make_spectrum_noise_clean(FILES_SPEAKER, NOISE, PATH_NOISE, PATH_CLEAN, NAME):
    n_frame = 64
    SNR = [-9, -6, -3, 0, 3, 6, 9]
    list_snr_noise = []
    for i in range(len(FILES_SPEAKER)):
        wav_speech_clean, sr_wav_clean = librosa.load(FILES_SPEAKER[i], sr=16000)
        #TODO:1 Here put a random selection of noise
        int_noise = random.randint(0,len(NOISE)-1)
        int_snr = random.randint(0,len(SNR)-1)
        list_snr_noise.append([NOISE[int_noise], SNR[int_snr]])
        wav_noise, sr_wav_noise = librosa.load(NOISE[int_noise], sr=16000)
        wav_speech_with_noise = add_noise_to_clean_audio(wav_speech_clean, wav_noise, SNR[int_snr])
        data_audio_clean, _, _ = make_spectrum(y=wav_speech_clean)
        data_audio_noise, _, _ = make_spectrum(y=wav_speech_with_noise)
        #CLEAN
        pt_files(i, data_audio_clean, n_frame, PATH_CLEAN)
        #NOISE
        pt_files(i, data_audio_noise, n_frame, PATH_NOISE)
    util.generate_pkl('stastics/', list_snr_noise, 'LIST_SNR_NOISE'+str(NAME))

In [29]:
#Training
make_spectrum_noise_clean(train_speaker, SELECTED_NOISE_S, NOISE_PATH_TRAINING_S, CLEAN_PATH_TRAINING_S, 'train_s')

[['data/NOISE/nonstationary/people.wav', -3], ['data/NOISE/nonstationary/helicopter.wav', -3], ['data/NOISE/nonstationary/helicopter.wav', 0], ['data/NOISE/nonstationary/dog_bark.wav', -9], ['data/NOISE/nonstationary/helicopter.wav', -9], ['data/NOISE/nonstationary/people.wav', -3], ['data/NOISE/nonstationary/crowd_party_adult_med.wav', 0], ['data/NOISE/nonstationary/babycry.wav', -3], ['data/NOISE/nonstationary/cafeteria_babble.wav', -9], ['data/NOISE/nonstationary/people.wav', -9], ['data/NOISE/nonstationary/crowd_party_adult_med.wav', -6], ['data/NOISE/nonstationary/helicopter.wav', 0], ['data/NOISE/nonstationary/babycry.wav', 0], ['data/NOISE/nonstationary/crowd_party_adult_med.wav', 0], ['data/NOISE/nonstationary/babycry.wav', 0], ['data/NOISE/nonstationary/babycry.wav', -3], ['data/NOISE/nonstationary/cafeteria_babble.wav', -6], ['data/NOISE/nonstationary/people.wav', 3], ['data/NOISE/nonstationary/helicopter.wav', -3], ['data/NOISE/nonstationary/dog_bark.wav', 9], ['data/NOISE/n

In [30]:
#Testing
make_spectrum_noise_clean(test_speaker, SELECTED_NOISE_S, NOISE_PATH_TESTING_S, CLEAN_PATH_TESTING_S, 'test_s')

[['data/NOISE/nonstationary/people.wav', -6], ['data/NOISE/nonstationary/helicopter.wav', 3], ['data/NOISE/nonstationary/dog_bark.wav', 9], ['data/NOISE/nonstationary/babycry.wav', -6], ['data/NOISE/nonstationary/helicopter.wav', -6], ['data/NOISE/nonstationary/dog_bark.wav', -9], ['data/NOISE/nonstationary/babycry.wav', 3], ['data/NOISE/nonstationary/helicopter.wav', 0], ['data/NOISE/nonstationary/dog_bark.wav', -9], ['data/NOISE/nonstationary/cafeteria_babble.wav', 9], ['data/NOISE/nonstationary/cafeteria_babble.wav', 3], ['data/NOISE/nonstationary/bell_church.wav', -9], ['data/NOISE/nonstationary/dog_bark.wav', -9], ['data/NOISE/nonstationary/people.wav', -9], ['data/NOISE/nonstationary/cafeteria_babble.wav', -6], ['data/NOISE/nonstationary/people.wav', -6], ['data/NOISE/nonstationary/dog_bark.wav', 0], ['data/NOISE/nonstationary/people.wav', -6], ['data/NOISE/nonstationary/babycry.wav', 6], ['data/NOISE/nonstationary/bell_church.wav', -6], ['data/NOISE/nonstationary/people.wav', -3

In [31]:
#Validation
make_spectrum_noise_clean(val_speaker, SELECTED_NOISE_S, NOISE_PATH_VALIDATION_S, CLEAN_PATH_VALIDATION_S, 'val_s')

[['data/NOISE/nonstationary/babycry.wav', 6], ['data/NOISE/nonstationary/bell_church.wav', 3], ['data/NOISE/nonstationary/people.wav', -9], ['data/NOISE/nonstationary/people.wav', 3], ['data/NOISE/nonstationary/helicopter.wav', -3], ['data/NOISE/nonstationary/cafeteria_babble.wav', 3], ['data/NOISE/nonstationary/babycry.wav', 0], ['data/NOISE/nonstationary/crowd_party_adult_med.wav', 3], ['data/NOISE/nonstationary/people.wav', -3], ['data/NOISE/nonstationary/people.wav', -9], ['data/NOISE/nonstationary/dog_bark.wav', 3], ['data/NOISE/nonstationary/crowd_party_adult_med.wav', -9], ['data/NOISE/nonstationary/babycry.wav', -3], ['data/NOISE/nonstationary/bell_church.wav', -9], ['data/NOISE/nonstationary/helicopter.wav', 6], ['data/NOISE/nonstationary/dog_bark.wav', 3], ['data/NOISE/nonstationary/cafeteria_babble.wav', 9], ['data/NOISE/nonstationary/crowd_party_adult_med.wav', -9], ['data/NOISE/nonstationary/dog_bark.wav', 6], ['data/NOISE/nonstationary/people.wav', -9], ['data/NOISE/nonst

In [32]:
#Training
make_spectrum_noise_clean(train_speaker, SELECTED_NOISE_T, NOISE_PATH_TRAINING_T, CLEAN_PATH_TRAINING_T, 'train_t')

[['data/NOISE/stationary/car.wav', 3], ['data/NOISE/stationary/pink.wav', 9], ['data/NOISE/stationary/pink.wav', 6], ['data/NOISE/stationary/dripping_water.wav', 6], ['data/NOISE/stationary/dripping_water.wav', 3], ['data/NOISE/stationary/rain.wav', 3], ['data/NOISE/stationary/cabin.wav', 6], ['data/NOISE/stationary/rain.wav', -6], ['data/NOISE/stationary/pink.wav', 3], ['data/NOISE/stationary/dripping_water.wav', 3], ['data/NOISE/stationary/typing.wav', 3], ['data/NOISE/stationary/wind.wav', 6], ['data/NOISE/stationary/rain.wav', -3], ['data/NOISE/stationary/rain.wav', -9], ['data/NOISE/stationary/wind.wav', -6], ['data/NOISE/stationary/typing.wav', -6], ['data/NOISE/stationary/car.wav', -9], ['data/NOISE/stationary/car.wav', 3], ['data/NOISE/stationary/pink.wav', -3], ['data/NOISE/stationary/rain.wav', 9], ['data/NOISE/stationary/wind.wav', -9], ['data/NOISE/stationary/cabin.wav', 3], ['data/NOISE/stationary/cabin.wav', -9], ['data/NOISE/stationary/cabin.wav', -6], ['data/NOISE/stati

In [33]:
#Testing
make_spectrum_noise_clean(test_speaker, SELECTED_NOISE_T, NOISE_PATH_TESTING_T, CLEAN_PATH_TESTING_T, 'test_t')

[['data/NOISE/stationary/car.wav', -9], ['data/NOISE/stationary/typing.wav', -6], ['data/NOISE/stationary/wind.wav', 9], ['data/NOISE/stationary/dripping_water.wav', 3], ['data/NOISE/stationary/rain.wav', -9], ['data/NOISE/stationary/wind.wav', -3], ['data/NOISE/stationary/cabin.wav', 6], ['data/NOISE/stationary/dripping_water.wav', 0], ['data/NOISE/stationary/dripping_water.wav', -3], ['data/NOISE/stationary/wind.wav', 3], ['data/NOISE/stationary/pink.wav', -9], ['data/NOISE/stationary/dripping_water.wav', -3], ['data/NOISE/stationary/wind.wav', 3], ['data/NOISE/stationary/wind.wav', -6], ['data/NOISE/stationary/dripping_water.wav', 6], ['data/NOISE/stationary/dripping_water.wav', -6], ['data/NOISE/stationary/car.wav', 9], ['data/NOISE/stationary/typing.wav', -3], ['data/NOISE/stationary/typing.wav', -9], ['data/NOISE/stationary/rain.wav', 3], ['data/NOISE/stationary/typing.wav', 6], ['data/NOISE/stationary/cabin.wav', 0], ['data/NOISE/stationary/typing.wav', 0], ['data/NOISE/stationa

In [34]:
#Validation
make_spectrum_noise_clean(val_speaker, SELECTED_NOISE_T, NOISE_PATH_VALIDATION_T, CLEAN_PATH_VALIDATION_T, 'val_t')

[['data/NOISE/stationary/cabin.wav', -9], ['data/NOISE/stationary/pink.wav', -9], ['data/NOISE/stationary/dripping_water.wav', 0], ['data/NOISE/stationary/cabin.wav', -6], ['data/NOISE/stationary/cabin.wav', 6], ['data/NOISE/stationary/car.wav', 9], ['data/NOISE/stationary/dripping_water.wav', 0], ['data/NOISE/stationary/wind.wav', 3], ['data/NOISE/stationary/wind.wav', 6], ['data/NOISE/stationary/cabin.wav', -3], ['data/NOISE/stationary/car.wav', 0], ['data/NOISE/stationary/dripping_water.wav', -6], ['data/NOISE/stationary/pink.wav', 9], ['data/NOISE/stationary/pink.wav', -6], ['data/NOISE/stationary/dripping_water.wav', -3], ['data/NOISE/stationary/cabin.wav', 0], ['data/NOISE/stationary/wind.wav', 9], ['data/NOISE/stationary/pink.wav', -9], ['data/NOISE/stationary/rain.wav', 6], ['data/NOISE/stationary/rain.wav', 6], ['data/NOISE/stationary/pink.wav', -6], ['data/NOISE/stationary/dripping_water.wav', 6], ['data/NOISE/stationary/typing.wav', -6], ['data/NOISE/stationary/cabin.wav', 3