In [None]:
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display as display
import librosa.feature
import soundfile as sf
from scipy.signal import resample
import os
import sys
import torch
import cv2
import random
import math

: 

In [12]:
##################################
# audio generation utils
##################################

def extract_audio(filename):
    """
    Extract audio given the filename (.wav, .flac, etc format)
    """

    audio, rate = sf.read(filename, always_2d=True)
    audio = np.reshape(audio, (1, -1))
    audio = audio[0]
    time = np.linspace(0, len(audio)/rate, len(audio), endpoint=False)
    return audio, time, rate

def generate_spec(audio_sequence, rate, n_fft=2048, hop_length=512):
    """
    Generate spectrogram using librosa
    audio_sequence: list representing waveform
    rate: sampling rate (16000 for all LibriSpeech audios)
    nfft and hop_length: stft parameters
    """
    S = librosa.feature.melspectrogram(audio_sequence, sr=rate, n_fft=n_fft, hop_length=hop_length, n_mels=128, fmin=20,
                                       fmax=8300)
    log_spectra = librosa.power_to_db(S, ref=np.mean, top_db=80)
    return log_spectra

def reconstruct_wave(spec, rate=16000, normalize_data=False):
    """
    Reconstruct waveform
    spec: spectrogram generated using Librosa
    rate: sampling rate
    """
    power = librosa.db_to_power(spec, ref=5.0)
    audio = librosa.feature.inverse.mel_to_audio(power, sr=rate, n_fft=2048, hop_length=512)
    out_audio = audio / np.max(audio) if normalize_data else audio
    return out_audio

def normalize(spec, eps=1e-6):
    """
    Normalize spectrogram with zero mean and unitary variance
    spec: spectrogram generated using Librosa
    """

    mean = spec.mean()
    std = spec.std()
    spec_norm = (spec - mean) / (std + eps)
    return spec_norm, (mean, std)

def minmax_scaler(spec):
    """
    min max scaler over spectrogram
    """
    spec_max = np.max(spec)
    spec_min = np.min(spec)

    return (spec-spec_min)/(spec_max - spec_min), (spec_max, spec_min)

def linear_scaler(spec):
    """
    linear scaler over spectrogram
    min value -> -1 and max value -> 1
    """
    spec_max = np.max(spec)
    spec_min = np.min(spec)
    m = 2/(spec_max-spec_min)
    n = (spec_max + spec_min)/(spec_min-spec_max)

    return m*spec + n, (m, n)

def split_specgram(example, clean_example, frames = 11):
    """
    Split specgram in groups of frames, the purpose is prepare data for the LSTM model input

    example: reverberant spectrogram
    clean_example: clean or target spectrogram

    return data input to the LSTM model and targets
    """
    clean_spec = clean_example[0, :, :]
    rev_spec = example[0, :, :]

    n, m = clean_spec.shape

    targets = torch.zeros((m-frames+1, n))
    data = torch.zeros((m-frames+1, n*frames))
  
    idx_target = frames//2
    for i in range(m-frames+1):
        try:
            targets[i, :] = clean_spec[:, idx_target]
            data[i, :] = torch.reshape(rev_spec[:, i:i+frames], (1, -1))[0, :]
            idx_target += 1
        except (IndexError):
            pass
    return data, targets

def split_realdata(example, frames = 11):
    
    """
    Split 1 specgram in groups of frames, the purpose is prepare data for the LSTM and MLP model input

    example: reverberant ''real'' (not simulated) spectrogram

    return data input to the LSTM or MLP model 
    """
  
    rev_spec = example[0, :, :]
    n, m = rev_spec.shape
    data = torch.zeros((m-frames+1, n*frames))
    for i in range(m-frames+1):
        data[i, :] = torch.reshape(rev_spec[:, i:i+frames], (1, -1))[0, :]
    return data

def prepare_data(X, y, display = False):

    """
    Use split_specgram to split all specgrams
    X: tensor containing reverberant spectrograms
    y: tensor containing target spectrograms
    """

    data0, target0 = split_specgram(X[0, :, :, :], y[0, :, :, :])

    total_data = data0.cuda()
    targets = target0.cuda()
  
    for i in range(1, X.shape[0]):
           if display: 
               print("Specgram n°" + str(i)) 

           data_i, target_i = split_specgram(X[i, :, :, :], y[i, :, :, :])
           total_data = torch.cat((total_data, data_i.cuda()), 0)
           targets = torch.cat((targets, target_i.cuda()), 0)

    return  total_data, targets


def split_for_supression(rev_tensor, target_tensor):
    """
    Given reverberant and target tensor with shape (#examples, 1, 128, 340)
    return tensors with the same information, but with shape (#examples*340, 128)
    """
    rev_transform = torch.tensor([])
    target_transform = torch.tensor([])

    for example in range(rev_tensor.shape[0]):
        rev_transform = torch.cat((rev_transform, rev_tensor[example, 0, :, :].T))
    
    if (target_tensor!=None):
        for example in range(target_tensor.shape[0]):
            target_transform = torch.cat((target_transform, target_tensor[example, 0, :, :].T))
  
    return rev_transform, target_transform

def normalize_per_frame(spec_transpose):
    """
    Normalize over spectrogram rows
    """
    means = []
    stds = []
    norm_spec = torch.zeros(spec_transpose.shape)

    for spec_row in range(norm_spec.shape[0]):
        current_mean = spec_transpose[spec_row, :].mean()
        current_std = spec_transpose[spec_row, :].std()
        means.append(current_mean)
        stds.append(current_std)
        norm_spec[spec_row, :] = (spec_transpose[spec_row, :]- current_mean)/(current_std+1e-6) 
  
    return norm_spec, (means, stds)

def denormalize_per_frame(norm_spec_transpose, means, stds):
    """
    denormalize row by row using means and stds given by normalize_per_frame
    """
    denorm_spec = torch.zeros(norm_spec_transpose.shape)

    for spec_row in range(norm_spec_transpose.shape[0]):
        denorm_spec[spec_row, :] = (norm_spec_transpose[spec_row, :])*(stds[spec_row] + 1e-6) + means[spec_row]
    
    return denorm_spec.T


#################################
# reverberation utils
#################################

def zero_pad(x, k):
    """
    add k zeros to x signal
    """
    return np.append(x, np.zeros(k))


def awgn(signal, regsnr):
    """
    add random noise to signal
    regsnr: signal to noise ratio
    """
    sigpower = sum([math.pow(abs(signal[i]), 2) for i in range(len(signal))])
    sigpower = sigpower / len(signal)
    noisepower = sigpower / (math.pow(10, regsnr / 10))
    sample = np.random.normal(0, 1, len(signal))
    noise = math.sqrt(noisepower) * sample
    return noise


def discrete_conv(x, h, x_fs, h_fs, snr=30, aug_factor=1):
    """
    Convolution using fft
    x: speech waveform
    h: RIR waveform
    x_fs: speech signal sampling rate (if is not 16000 the signal will be resampled)
    h_fs: RIR signal sampling rate (if is not 16000 the signal will be resampled)

    Based on https://github.com/vtolani95/convolution/blob/master/reverb.py
    """

    numSamples_h = round(len(h) / h_fs * 16000)
    numSamples_x = round(len(x) / x_fs * 16000)

    if h_fs != 16000:
        h = resample(h, numSamples_h) # resample RIR

    if x_fs != 16000:
        x = resample(x, numSamples_x) # resample speech signal

    L, P = len(x), len(h)
    h_zp = zero_pad(h, L - 1)
    x_zp = zero_pad(x, P - 1)
    X = np.fft.fft(x_zp)
    output = np.fft.ifft(X * np.fft.fft(h_zp)).real
    output = aug_factor * output + x_zp
    output = output + awgn(output, snr)
    return output

###################################
#plot utils
###################################

def graph_spec(spec, rate=16000, title=False):
    """
    plot spectrogram
    spec: spectrogram generated using Librosa
    rate: sampling rate
    """
    plt.figure()
    display.specshow(spec, sr=rate, y_axis='mel', x_axis='time')
    plt.colorbar(format='%+2.0f dB')
    if (title):
        plt.title('Log-Power spectrogram')
    plt.tight_layout()

def plot_time_wave(audio, rate=16000):
    """
    plot waveform given speech audio
    audio: array containing waveform
    rate: sampling rate

    """
    time = np.linspace(0, len(audio)/rate, len(audio), endpoint=False)
    plt.figure()
    plt.plot(time, audio)
    plt.xlabel("Time (secs)")
    plt.ylabel("Power")


# Data Download

In [14]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [None]:
!wget https://www.openslr.org/resources/12/train-clean-100.tar.gz
!tar -xf train-clean-100.tar.gz
!mv LibriSpeech/ LibriSpeechTrain/

In [None]:
!wget https://www.openslr.org/resources/12/test-clean.tar.gz
!tar -xf test-clean.tar.gz
!mv LibriSpeech/ LibriSpeechTest/

In [None]:
!wget isophonics.net/files/irs/classroomOmni.zip
!unzip classroomOmni
!mv Omni/ ClassroomOmni/

In [None]:
!wget isophonics.net/files/irs/octagonOmni.zip
!unzip octagonOmni
!mv Omni/ OctagonOmni/

In [None]:
!wget isophonics.net/files/irs/greathallOmni.zip
!unzip greathallOmni
!mv Omni/ GreatHallOmni/

In [None]:
!ls

In [10]:
def train_data(audio_dir, rir_dir, lower_bound, upper_bound, checkpointX, checkpointY):
    """
    Read training data generating reverberant waveforms and spectrograms
   
    audio_dir: directory containing the speech audio files
    rir_dir: directory containing RIRs audio files
    lower_bound: initial example to be considered
    upper_bound: final example to be considered
    checkpointX: directory + filename to save reverberant data
    checkpointY: directory + filename to save target data 
    """

    sys.path.append(audio_dir)
    sys.path.append(rir_dir)
    
    rir_file_names = []
    for subdir, dirs, files in os.walk(rir_dir):
        for file in files:
            if (".wav" in file):
                rir_file_names.append(os.path.join(subdir,file))

    audio_file_names = []
    for subdir, dirs, files in os.walk(audio_dir):
        for file in files:
            if (".flac" in file):
                audio_file_names.append(os.path.join(subdir,file))
    
    print ("RIRs found: " + str(len(rir_file_names)))
    print ("Audio files found: " + str(len(audio_file_names)))
    time_size = 340
    frequency_size = 128
    X = torch.zeros((upper_bound-lower_bound, 1, frequency_size, time_size))
    y = torch.zeros((upper_bound-lower_bound, 1, frequency_size, time_size))
    for i in range(lower_bound, upper_bound):
        rir_index = random.sample(range(len(rir_file_names)), 1)[0]
        ir_audio, ir_time, ir_rate = extract_audio(rir_file_names[rir_index])

        speech_audio, speech_time, speech_rate = extract_audio(audio_file_names[i])
        speech_spec = generate_spec(speech_audio, speech_rate)
        
        random_snr = random.sample(range(15, 36), 1)[0]
        speech_rev = discrete_conv(speech_audio, ir_audio, 16000, 96000, snr = random_snr)
        speech_rev = speech_rev[0:len(speech_audio)]
        rev_spec = generate_spec(speech_rev, speech_rate)
        
        speech_spec = cv2.resize(speech_spec, dsize = (time_size, frequency_size), interpolation = cv2.INTER_LANCZOS4)
        rev_spec = cv2.resize(rev_spec, dsize = (time_size, frequency_size), interpolation = cv2.INTER_LANCZOS4)

        print("Proccesing audio file n°: " + str(i+1))
        X[i-lower_bound, 0, :, :] = torch.tensor(rev_spec)
        y[i-lower_bound, 0, :, :] = torch.tensor(speech_spec)

        if ((i+1)%500 == 0):
          torch.save(X, checkpointX)
          torch.save(y, checkpointY)
          print('Saved data')

    return X, y

In [None]:
rir_rootdir1 = 'GreatHallOmni/'
audio_rootdir1 = 'LibriSpeechTrain/'
checkpointX1 = '/content/drive/My Drive/data_audio/non_norm_data/X_train_1.pth'
checkpointY1 = '/content/drive/My Drive/data_audio/non_norm_data/y_train_1.pth'
_, _ = train_data(audio_rootdir1, rir_rootdir1, 0, 5000, checkpointX1, checkpointY1)

In [None]:
rir_rootdir2 = 'OctagonOmni/'
audio_rootdir2 = 'LibriSpeechTrain/'
checkpointX2 = '/content/drive/My Drive/data_audio/non_norm_data/X_train_2.pth'
checkpointY2 = '/content/drive/My Drive/data_audio/non_norm_data/y_train_2.pth'
_, _ = train_data(audio_rootdir2, rir_rootdir2, 5000, 10000, checkpointX2, checkpointY2)

rir_rootdir3 = 'GreatHallOmni/'
audio_rootdir3 = 'LibriSpeechTrain/'
checkpointX3 = '/content/drive/My Drive/data_audio/non_norm_data/X_train_3.pth'
checkpointY3 = '/content/drive/My Drive/data_audio/non_norm_data/y_train_3.pth'
_, _ = train_data(audio_rootdir3, rir_rootdir3, 10000, 15000, checkpointX3, checkpointY3)

rir_rootdir4 = 'OctagonOmni/'
audio_rootdir4 = 'LibriSpeechTrain/'
checkpointX4 = '/content/drive/My Drive/data_audio/non_norm_data/X_train_4.pth'
checkpointY4 = '/content/drive/My Drive/data_audio/non_norm_data/y_train_4.pth'
_, _ = train_data(audio_rootdir4, rir_rootdir4, 15000, 20000, checkpointX4, checkpointY4)

In [None]:
def test_data(audio_dir, rir_dir, lower_bound, upper_bound, checkpoints, noise = [15, 35]):

    """
    read test data generating reverberant spectrograms and waveforms
    
    audio_dir: directory containing speech audio files
    rir_dir: directory
    lower_bound: initial example to be considered
    upper_bound: final example to be considered
    checkpoints: list containing directories for save rev spectrogram, target spectrogram
                rev waveforms and target waveforms respectively
    noise: add noise with random snr in [noise[0], noise[1]]
    """
    
    checkpointX = checkpoints[0]
    checkpointY = checkpoints[1]
    checkpoint_waverev = checkpoints[2]
    checkpoint_wavetarget = checkpoints[3]
    
    sys.path.append(audio_dir)
    sys.path.append(rir_dir)
    
    rir_file_names = []
    for subdir, dirs, files in os.walk(rir_dir):
        for file in files:
            if (".wav" in file):
                rir_file_names.append(os.path.join(subdir,file))

    audio_file_names = []
    for subdir, dirs, files in os.walk(audio_dir):
        for file in files:
            if (".flac" in file):
                audio_file_names.append(os.path.join(subdir,file))
    
    print ("RIRs found: " + str(len(rir_file_names)))
    print ("Audio files found: " + str(len(audio_file_names)))
    time_size = 340
    frequency_size = 128
    X = torch.zeros((upper_bound-lower_bound, 1, frequency_size, time_size))
    y = torch.zeros((upper_bound-lower_bound, 1, frequency_size, time_size))

    wave_data = []
    wave_targets = []


    for i in range(lower_bound, upper_bound):
        rir_index = random.sample(range(len(rir_file_names)), 1)[0]
        ir_audio, ir_time, ir_rate = extract_audio(rir_file_names[rir_index])

        speech_audio, speech_time, speech_rate = extract_audio(audio_file_names[i])
        wave_targets.append(speech_audio)
        speech_spec = generate_spec(speech_audio, speech_rate)
        
        random_snr = random.sample(range(noise[0], noise[1]), 1)[0]
        speech_rev = discrete_conv(speech_audio, ir_audio, 16000, 96000, snr = random_snr)
        speech_rev = speech_rev[0:len(speech_audio)]
        wave_data.append(speech_rev)
        rev_spec = generate_spec(speech_rev, speech_rate)
        
        speech_spec = cv2.resize(speech_spec, dsize = (time_size, frequency_size), interpolation = cv2.INTER_LANCZOS4)
        rev_spec = cv2.resize(rev_spec, dsize = (time_size, frequency_size), interpolation = cv2.INTER_LANCZOS4)

        print("Proccesing audio file n°: " + str(i+1))
        X[i-lower_bound, 0, :, :] = torch.tensor(rev_spec)
        y[i-lower_bound, 0, :, :] = torch.tensor(speech_spec)

        if ((i+1)%500 == 0):
          torch.save(X, checkpointX)
          torch.save(y, checkpointY)
          torch.save(wave_data, checkpoint_waverev)
          torch.save(wave_targets, checkpoint_wavetarget)
          print('Saved data')

    return X, y

In [None]:
def mardy_test_data(audio_dir, rir_dir, lower_bound, upper_bound, checkpoints, snr = 30, distance = 'far'):

    """
    read test data generating reverberant spectrograms and waveforms
    
    audio_dir: directory containing speech audio files
    rir_dir: directory contaning MARDY RIRs
    lower_bound: initial example to be considered
    upper_bound: final example to be considered
    checkpoints: list containing directories for save rev spectrogram, target spectrogram
                rev waveforms and target waveforms respectively
    snr: add awgn with snr
    """
    
    checkpointX = checkpoints[0]
    checkpointY = checkpoints[1]
    checkpoint_waverev = checkpoints[2]
    checkpoint_wavetarget = checkpoints[3]
    
    sys.path.append(audio_dir)
    sys.path.append(rir_dir)

    num_distance = '3' if distance == 'far' else '1'
    print('Distance Microphones ' + num_distance)
    
    rir_file_names = []
    for subdir, dirs, files in os.walk(rir_dir):
        for file in files:
            if (".wav" in file and file[3]==num_distance):
                rir_file_names.append(os.path.join(subdir,file))

    audio_file_names = []
    for subdir, dirs, files in os.walk(audio_dir):
        for file in files:
            if (".flac" in file):
                audio_file_names.append(os.path.join(subdir,file))
    
    print ("RIRs found: " + str(len(rir_file_names)))
    print ("Audio files found: " + str(len(audio_file_names)))
    time_size = 340
    frequency_size = 128
    X = torch.zeros((upper_bound-lower_bound, 1, frequency_size, time_size))
    y = torch.zeros((upper_bound-lower_bound, 1, frequency_size, time_size))

    wave_data = []
    wave_targets = []


    for i in range(lower_bound, upper_bound):
        rir_index = random.sample(range(len(rir_file_names)), 1)[0]
        ir_audio, ir_time, ir_rate = extract_audio(rir_file_names[rir_index])

        speech_audio, speech_time, speech_rate = extract_audio(audio_file_names[i])
        wave_targets.append(speech_audio)
        speech_spec = generate_spec(speech_audio, speech_rate)
        speech_rev = discrete_conv(speech_audio, ir_audio, 16000, 48000, aug_factor = 10, snr = snr)
        speech_rev = speech_rev[0:len(speech_audio)]
        wave_data.append(speech_rev)
        rev_spec = generate_spec(speech_rev, speech_rate)
        
        speech_spec = cv2.resize(speech_spec, dsize = (time_size, frequency_size), interpolation = cv2.INTER_LANCZOS4)
        rev_spec = cv2.resize(rev_spec, dsize = (time_size, frequency_size), interpolation = cv2.INTER_LANCZOS4)

        print("Proccesing audio file n°: " + str(i+1))
        X[i-lower_bound, 0, :, :] = torch.tensor(rev_spec)
        y[i-lower_bound, 0, :, :] = torch.tensor(speech_spec)

        if ((i+1)%500 == 0):
          torch.save(X, checkpointX)
          torch.save(y, checkpointY)
          torch.save(wave_data, checkpoint_waverev)
          torch.save(wave_targets, checkpoint_wavetarget)
          print('Saved data')

    return X, y

In [None]:
rir_rootdir = 'ClassroomOmni/'
audio_rootdir = 'LibriSpeechTest/'
checkpointX = '/content/drive/My Drive/data_audio/non_norm_data/X_test.pth'
checkpointY = '/content/drive/My Drive/data_audio/non_norm_data/y_test.pth'
checkpoint_waverev = '/content/drive/My Drive/data_audio/non_norm_data/waverev.pth'
checkpoint_wavetarget = '/content/drive/My Drive/data_audio/non_norm_data/wavetarget.pth'

checkpoints = [checkpointX, checkpointY, checkpoint_waverev, checkpoint_wavetarget]

X, y = test_data(audio_rootdir, rir_rootdir, 0, 500, checkpoints)

In [None]:
rir_rootdir = '/content/drive/My Drive/data_espec/MARDY'
audio_rootdir = 'LibriSpeechTest/'
checkpointX = '/content/drive/My Drive/data_audio/non_norm_data/X_test_2.pth'
checkpointY = '/content/drive/My Drive/data_audio/non_norm_data/y_test_2.pth'
checkpoint_waverev = '/content/drive/My Drive/data_audio/non_norm_data/waverev_2.pth'
checkpoint_wavetarget = '/content/drive/My Drive/data_audio/non_norm_data/wavetarget_2.pth'

checkpoints = [checkpointX, checkpointY, checkpoint_waverev, checkpoint_wavetarget]

X, y = mardy_test_data(audio_rootdir, rir_rootdir, 500, 1000, checkpoints, snr = 30)

In [None]:
rir_rootdir = '/content/drive/My Drive/data_espec/MARDY'
audio_rootdir = 'LibriSpeechTest/'
checkpointX = '/content/drive/My Drive/data_audio/non_norm_data/X_test_3.pth'
checkpointY = '/content/drive/My Drive/data_audio/non_norm_data/y_test_3.pth'
checkpoint_waverev = '/content/drive/My Drive/data_audio/non_norm_data/waverev_3.pth'
checkpoint_wavetarget = '/content/drive/My Drive/data_audio/non_norm_data/wavetarget_3.pth'

checkpoints = [checkpointX, checkpointY, checkpoint_waverev, checkpoint_wavetarget]

X, y = mardy_test_data(audio_rootdir, rir_rootdir, 500, 1000, checkpoints, snr = 30, distance = 'near')

In [None]:
rir_rootdir = 'ClassroomOmni/'
audio_rootdir = 'LibriSpeechTest/'
checkpointX = '/content/drive/My Drive/data_audio/non_norm_data/X_test_4.pth'
checkpointY = '/content/drive/My Drive/data_audio/non_norm_data/y_test_4.pth'
checkpoint_waverev = '/content/drive/My Drive/data_audio/non_norm_data/waverev_4.pth'
checkpoint_wavetarget = '/content/drive/My Drive/data_audio/non_norm_data/wavetarget_4.pth'
checkpoints = [checkpointX, checkpointY, checkpoint_waverev, checkpoint_wavetarget]
X, y = test_data(audio_rootdir, rir_rootdir, 0, 500, checkpoints, noise = [35, 36])

In [None]:
rir_rootdir = 'ClassroomOmni/'
audio_rootdir = 'LibriSpeechTest/'
checkpointX = '/content/drive/My Drive/data_audio/non_norm_data/X_test_5.pth'
checkpointY = '/content/drive/My Drive/data_audio/non_norm_data/y_test_5.pth'
checkpoint_waverev = '/content/drive/My Drive/data_audio/non_norm_data/waverev_5.pth'
checkpoint_wavetarget = '/content/drive/My Drive/data_audio/non_norm_data/wavetarget_5.pth'
checkpoints = [checkpointX, checkpointY, checkpoint_waverev, checkpoint_wavetarget]
X, y = test_data(audio_rootdir, rir_rootdir, 0, 500, checkpoints, noise = [15, 16])

In [None]:
def realdata_from_dir(audio_dir, lower_bound, upper_bound, checkpointX, checkpoint_wave):
    """
    read real test data with reverberant spectrograms and waveforms
    
    audio_dir: directory containing speech audio files
    lower_bound: initial example to be considered
    upper_bound: final example to be considered
    checkpointX: directory + filename to save reverberant spectrograms
    checkpointX: directory + filename to save reverberant waveforms
    """
    
    sys.path.append(audio_dir)

    audio_file_names = []
    for subdir, dirs, files in os.walk(audio_dir):
        for file in files:
            if (".wav" in file):
                audio_file_names.append(os.path.join(subdir,file))

    print ("Archivos de audio encontrados: " + str(len(audio_file_names)))
    time_size = 340
    frequency_size = 128

    X = torch.zeros((upper_bound-lower_bound, 1, frequency_size, time_size))
    y = torch.zeros((upper_bound-lower_bound, 1, frequency_size, time_size))
    
    waves_rev = []
    for i in range(lower_bound, upper_bound):

        speech_rev, speech_time, speech_rate = extract_audio(audio_file_names[i])
        waves_rev.append(speech_rev)
        rev_spec = generate_spec(speech_rev, speech_rate)
        rev_spec = cv2.resize(rev_spec, dsize = (time_size, frequency_size), interpolation = cv2.INTER_LANCZOS4)

        print("Procesado archivo de audio n°: " + str(i+1))
        X[i-lower_bound, 0, :, :] = torch.tensor(rev_spec)
  
        if ((i+1)%50 == 0):
          torch.save(X, checkpointX)
          torch.save(waves_rev, checkpoint_wave)
          print('Saved data')

    return X

In [None]:
audio_rootdir = '/content/drive/My Drive/masive_data/VUT_FIT_L207/MicID01/SpkID01_20171225_T/01/'
checkpointX = '/content/drive/My Drive/real_data/X_test_real1.pth'
checkpoint_wave = '/content/drive/My Drive/real_data/waves1.pth'
X = realdata_from_dir(audio_rootdir, 0, 500, checkpointX, checkpoint_wave)

In [None]:
audio_rootdir = '/content/drive/My Drive/masive_data/VUT_FIT_L207/MicID01/SpkID01_20171225_T/10/'
checkpointX = '/content/drive/My Drive/real_data/X_test_real2.pth'
checkpoint_wave = '/content/drive/My Drive/real_data/waves2.pth'
X = realdata_from_dir(audio_rootdir, 0, 500, checkpointX, checkpoint_wave)