In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
import os, argparse
import numpy as np
import h5py
import pickle
import librosa
from scipy import interpolate
import matplotlib.pyplot as plt
from scipy.signal import decimate, butter, lfilter
import IPython.display as ipd
from scipy import signal
from ops import silence_filtering, upsample
from tqdm.notebook import tqdm
from keras.applications import MobileNet
from keras.models import Model
import re
from skimage.transform import resize
import librosa

Using TensorFlow backend.


In [2]:
# Preprocessing settings; create_data reads these as module-level globals.
dimension = 8192 #--dimension -> patch length in samples. NOTE(review): the original note says "use -1 for no patching", but create_data in this notebook does not handle -1
sr = 16000 #args.sr -> audio sampling rate
scale = 4 #args.scale -> scaling factor
low_pass = True #args.low_pass -> apply low-pass filter when generating low-res patches
stride = 2048  #args.stride -> hop between patch starts: 8192*0.25 = 2048, i.e. consecutive patches overlap by 75% (the original note cites "Time Frequency Networks For Audio Super-Resolu[tion]")
batch_size = 128 # both TFNet and Kuleshov use 128 (per the original author's note)
trim_silence = False # when True, create_data trims silence from the training audio
silence_trash = 0 # TODO: still to be defined; not referenced anywhere in the visible code

In [3]:
def silence_filtering(sig, top_db=60):
    """Trim leading and trailing silence from an audio signal.

    Thin wrapper around ``librosa.effects.trim`` that returns only the
    trimmed signal and discards the index interval librosa also returns.

    Parameters
    ----------
    sig : np.ndarray
        Audio signal handed to ``librosa.effects.trim``.
    top_db : number, optional
        Threshold (in dB below the reference) under which audio is treated
        as silence. Defaults to 60, which is also librosa's own default.

    Returns
    -------
    np.ndarray
        The trimmed signal.
    """
    # NOTE: this definition shadows the `silence_filtering` imported from `ops`.
    # The threshold must come from the `top_db` argument; the boolean
    # `trim_silence` flag is an on/off switch, not a dB value.
    filt_sig, _ = librosa.effects.trim(sig, top_db=top_db, frame_length=2048, hop_length=512)
    return filt_sig

In [4]:
# Parent directory of the current working directory.
ROOT_DIR = os.path.dirname(os.path.abspath('.'))
# Built from os.sep instead of hard-coded backslashes: the resulting string is
# byte-identical on Windows and becomes a valid path on other platforms too.
output_dir = os.sep + os.path.join('processedData', 'speaker1', 'train&validation') + os.sep
# Directory prefix (with trailing separator) to which output file names are appended.
out = ROOT_DIR + output_dir #args.out -> path to output h5 archive

In [5]:
# id_list = [id_ for id_ in os.listdir(ROOT_DIR + data_path)] # id_list contains 109 speakers
# first_audio = ROOT_DIR + data_path + id_list[0] + '\\p225_366.wav'
# sig, rate = librosa.load(first_audio, sr=sr, mono=False)
# plt.plot(sig)
# ipd.Audio(sig, rate= rate)

In [7]:
def create_data(how='training', top_db=60,
                data_dir=r'C:\Users\Giobi\Tesi\rawData\VCTK-Corpus\wav48\p225',
                in_dir=r'C:\Users\Giobi\Tesi\processedData\speaker1'):
    """Extract aligned low-res / high-res audio patches for one split and save them.

    The names of the ``.wav`` files of the chosen split are read from a text
    file in ``in_dir``. Each file is loaded from ``data_dir`` at ``sr`` Hz and
    a degraded copy is built by downsampling by ``scale`` and bringing it back
    to the original length with ``upsample``. Both signals are then cut into
    patches of ``dimension`` samples taken every ``stride`` samples, and the
    number of patches is cropped to a multiple of ``batch_size``.

    Files written (all under the module-level ``out`` prefix):

    * ``train_data.hdf5`` or ``validation_data.hdf5`` with two float32
      datasets: ``data_lr`` (degraded patches) and ``label`` (original patches).
    * ``ID_list_patches_<dimension>_<scale>_<how>``: pickled list with one
      speaker ID per stored patch.
    * ``ID_list_patches_<dimension>_<scale>``: the same list under the original
      file name. It is shared by both splits, so it only ever holds the IDs of
      the most recent call; it is kept for backward compatibility.

    Module-level settings read: ``dimension``, ``stride``, ``scale``,
    ``low_pass``, ``batch_size``, ``trim_silence``, ``sr`` and ``out``.

    Parameters
    ----------
    how : str
        ``'training'`` or ``'validation'``.
    top_db : number, optional
        Threshold passed to ``silence_filtering``; only used when ``how`` is
        ``'training'`` and ``trim_silence`` is enabled.
    data_dir : str, optional
        Directory containing the audio files named in the split list.
    in_dir : str, optional
        Directory containing ``speaker1-train-files.txt`` and
        ``speaker1-val-files.txt``.

    Raises
    ------
    ValueError
        If ``how`` is neither ``'training'`` nor ``'validation'``, or if no
        speaker ID can be found in a file path.
    """
    if how == 'training':
        list_name, h5_name = 'speaker1-train-files.txt', 'train_data.hdf5'
    elif how == 'validation':
        list_name, h5_name = 'speaker1-val-files.txt', 'validation_data.hdf5'
    else:
        raise ValueError('you should choose between training or validation')

    # Collect the full paths of the .wav files listed for this split.
    file_extensions = {'.wav'}
    file_list = []
    with open(os.path.join(in_dir, list_name)) as f:
        for line in f:
            filename = line.strip()
            if os.path.splitext(filename)[1] in file_extensions:
                file_list.append(os.path.join(data_dir, filename))

    # Patch length and hop; low-res and high-res patches share the same size
    # because the low-res signal is interpolated back to full length.
    d, d_lr = dimension, dimension
    s = stride
    hr_patches, lr_patches, ID_list = [], [], []

    for file_path in file_list:
        # Speaker ID, e.g. "p225" -> 225. The pattern originally ended with a
        # slash and the slice was [1:-1]; that never matched these paths and,
        # without the slash, would have produced 22 instead of 225.
        match = re.search(r'p\d\d\d', file_path)
        if match is None:
            raise ValueError('no speaker ID (pNNN) found in ' + file_path)
        ID = int(match.group(0)[1:])

        # load audio file, resampled to the configured rate
        x, _ = librosa.load(file_path, sr=sr)

        if how == 'training' and trim_silence:
            x = silence_filtering(x, top_db)

        # Crop so that the number of samples is a multiple of the scaling
        # ratio (e.g. scale = 2: an odd-length signal loses its last sample).
        x = x[: len(x) - (len(x) % scale)]

        # generate low-res version
        if low_pass:
            # decimate low-pass filters before downsampling
            x_lr = decimate(x, scale)
        else:
            # plain subsampling: keep one sample every `scale`
            x_lr = np.array(x[0::scale])
        # Interpolate the low-res signal back to the original length
        # (cubic splines, per the original author's note).
        x_lr = upsample(x_lr, scale)
        assert len(x) % scale == 0
        assert len(x_lr) == len(x)
        assert x.dtype == np.float32
        assert x_lr.dtype == np.float32

        # generate patches: same window on both signals
        for i in range(0, len(x) - d + 1, s):
            hr_patch = np.array(x[i : i + d])
            lr_patch = np.array(x_lr[i : i + d_lr])

            assert len(hr_patch) == d
            assert len(lr_patch) == d_lr

            hr_patches.append(hr_patch.reshape((d, 1)))
            lr_patches.append(lr_patch.reshape((d_lr, 1)))
            ID_list.append(ID)

    # crop # of patches so that it's a multiple of mini-batch size
    num_to_keep = (len(hr_patches) // batch_size) * batch_size
    hr_patches = np.array(hr_patches[:num_to_keep])
    lr_patches = np.array(lr_patches[:num_to_keep])
    ID_list = ID_list[:num_to_keep]

    # Write the HDF5 archive; the context manager guarantees the file is
    # flushed and closed before anything else tries to open it.
    with h5py.File(out + h5_name, 'w') as h5_file:
        data_set_lr = h5_file.create_dataset('data_lr', lr_patches.shape, np.float32)
        data_set_lr[...] = lr_patches
        label_set = h5_file.create_dataset('label', hr_patches.shape, np.float32)
        label_set[...] = hr_patches

    # Store the per-patch speaker IDs. The split-specific file keeps training
    # and validation IDs apart; the legacy name (no split suffix) is still
    # written so existing readers keep working.
    id_base = out + 'ID_list_patches_' + str(d) + '_' + str(scale)
    for id_path in (id_base + '_' + how, id_base):
        with open(id_path, 'wb') as id_file:
            pickle.dump(ID_list, id_file)

In [8]:
# Build the archives for both splits, training first and validation second.
for split in ('training', 'validation'):
    create_data(how=split)

In [9]:
# Load both splits fully into memory and close each archive right away.
# A handle left open in read mode can make a later re-run of create_data fail,
# because it reopens the same files in 'w' mode.
with h5py.File(out + 'train_data.hdf5', 'r') as hf_train:
    X_train = np.array(hf_train.get('data_lr')).astype('float32')
    Y_train = np.array(hf_train.get('label')).astype('float32')

with h5py.File(out + 'validation_data.hdf5', 'r') as hf_val:
    X_val = np.array(hf_val.get('data_lr')).astype('float32')
    Y_val = np.array(hf_val.get('label')).astype('float32')

In [11]:
# Low-res training inputs: (number of patches, patch length, 1); create_data crops the patch count to a multiple of batch_size.
X_train.shape

(6656, 8192, 1)

In [12]:
# Low-res validation inputs: (number of patches, patch length, 1); create_data crops the patch count to a multiple of batch_size.
X_val.shape

(768, 8192, 1)