In [7]:
import os

#os.environ['KALDI_ROOT'] = '/opt/kaldi/'

import kaldi_io
import numpy as np
import librosa
import IPython.display as ipd
import random
import kaldiio
from kaldiio import WriteHelper
from utils import add_noise

In [8]:
from tqdm import tqdm

# Input folders; process_and_serialize reads the 'wav.scp' listing found in each.
clean_train_folder = '../data/train/'
clean_test_folder = '../data/test/'
# Output folders that receive the serialized slice pairs as .npy files.
serialized_train_folder = '../data/serialized_train_data/'
serialized_test_folder = '../data/serialized_test_data/'
# Samples per slice: 2 ** 14 = 16384, i.e. 1.024 s of audio at 16000 samples/s.
window_size = 2 ** 14  # about 1 second of samples
# Sampling rate, in samples per second.
sample_rate = 16000

In [9]:
def slice_signal(wav, window_size, stride=0.5):
    """
    Cut a signal into fixed-length, overlapping windows.

    Consecutive windows start ``int(window_size * stride)`` samples apart, so
    they overlap by a fraction ``1 - stride`` of their length (50% with the
    default ``stride=0.5``). Trailing samples that do not fill a whole window
    are dropped.

    Args:
        wav: Sliceable sequence of samples (e.g. a 1-D numpy array or a list).
        window_size: Number of samples in every returned slice.
        stride: Hop between window starts, as a fraction of ``window_size``.

    Returns:
        A list of slices of ``wav``, each exactly ``window_size`` samples long.
        The list is empty when ``wav`` is shorter than one window.

    Raises:
        ValueError: If the resulting hop is smaller than one sample.
    """
    hop = int(window_size * stride)
    if hop < 1:
        raise ValueError(
            'window_size * stride must be at least 1 sample, got {}'.format(hop))
    # range() excludes its stop value, so the bound is len(wav) + 1: this keeps
    # the window that ends exactly on the last sample of the signal.
    return [wav[end_idx - window_size:end_idx]
            for end_idx in range(window_size, len(wav) + 1, hop)]

In [73]:
def process_and_serialize(data_type):
    """
    Mix every clean utterance with noise, slice both signals and save the pairs.

    For each ``<utt> <path>`` line of ``wav.scp`` in the clean folder, and for
    each noise category (music, noise, speech), a random entry of
    ``../data/musan_<category>.scp`` and a random SNR are drawn, the noisy
    mixture is built with ``add_noise``, and both the clean and the noisy
    signal are cut with ``slice_signal``. Each aligned pair of slices is saved
    as one ``.npy`` array whose row 0 is the clean slice and row 1 the noisy
    slice.

    Output files are named
    ``<utt>.wav_<noise_type>_<snr>_<aug>_<slice_idx>.npy``.

    Args:
        data_type: ``'train'`` selects the training folders; any other value
            selects the test folders.

    Raises:
        ValueError: If a noise listing contains no entries.
    """
    # Upper bound on how many leading entries of each noise listing are sampled.
    noise_choice = {'music': 659, 'noise': 929, 'speech': 425}
    train_augmentation = 1
    test_augmentation = 1
    stride = .5
    snr_levels = [2.5, 7.5, 12.5, 17.5]

    if data_type == 'train':
        clean_folder = clean_train_folder
        serialized_folder = serialized_train_folder
        augmentation = train_augmentation
    else:
        clean_folder = clean_test_folder
        serialized_folder = serialized_test_folder
        augmentation = test_augmentation

    os.makedirs(serialized_folder, exist_ok=True)

    # Read each noise listing once up front (and close it) instead of
    # re-opening and re-reading the file for every utterance.
    noise_listings = {}
    for noise_type, max_tracks in noise_choice.items():
        listing_path = '../data/musan_{}.scp'.format(noise_type)
        with open(listing_path) as noise_scp:
            # Cap at the configured count, but never index past the real file.
            entries = noise_scp.readlines()[:max_tracks]
        if not entries:
            raise ValueError('Noise listing {} is empty'.format(listing_path))
        noise_listings[noise_type] = entries

    with open(os.path.join(clean_folder, 'wav.scp')) as wav_scp:
        for line in tqdm(wav_scp,
                         desc='Serialize and down-sample {} audios'.format(data_type)):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline) in wav.scp.
                continue

            utt, path = line.rstrip().split()
            clean_file, _ = librosa.load(path, sr=sample_rate)
            # The clean slices do not depend on the noise, so compute them once.
            clean_sliced = slice_signal(clean_file, window_size, stride)

            for noise_type, entries in noise_listings.items():
                for aug in range(augmentation):
                    noise_track = np.random.randint(0, len(entries))
                    _, noise_path = entries[noise_track].rstrip().split()
                    noise_audio, _ = librosa.load(noise_path, sr=sample_rate)
                    snr = random.choice(snr_levels)
                    noisy_file = add_noise(clean_file, noise_audio, snr=snr)
                    noisy_sliced = slice_signal(noisy_file, window_size, stride)

                    # serialize - file format goes
                    # [utt].wav_[noise_type]_[snr]_[aug]_[slice_number].npy
                    # ex) EN_C1_12_107.wav_music_17.5_0_5.npy is slice 5 of
                    # utterance EN_C1_12_107 mixed with music at SNR 17.5,
                    # augmentation version 0
                    for idx, (clean_slice, noisy_slice) in enumerate(
                            zip(clean_sliced, noisy_sliced)):
                        pair = np.array([clean_slice, noisy_slice])
                        np.save(os.path.join(serialized_folder,
                                             '{}.wav_{}_{}_{}_{}'.format(
                                                 utt, noise_type, snr, aug, idx)),
                                arr=pair)

In [69]:
def data_verify(data_type):
    """
    Check that every serialized pair holds slices of ``window_size`` samples.

    Walks the serialized folder for ``data_type``, loads each file with
    ``np.load`` and compares the size of its second axis with ``window_size``.
    The first mismatch is printed and verification stops there.

    Args:
        data_type: ``'train'`` checks the serialized training folder; any
            other value checks the serialized test folder.
    """
    if data_type == 'train':
        serialized_folder = serialized_train_folder
    else:
        serialized_folder = serialized_test_folder

    for root, _dirs, files in os.walk(serialized_folder):
        for filename in tqdm(files, desc='Verify serialized {} audios'.format(data_type)):
            data_pair = np.load(os.path.join(root, filename))
            if data_pair.shape[1] != window_size:
                print('Snippet length not {} : {} instead'.format(window_size, data_pair.shape[1]))
                # Stop the whole verification: a plain `break` would only leave
                # the current directory and keep walking the rest of the tree.
                return

In [65]:
# Build the serialized training pairs, then check the length of every saved slice.
process_and_serialize('train')

data_verify('train')

Serialize and down-sample train audios: 2448it [12:30,  3.26it/s]


In [70]:
# Build the serialized test pairs, then check the length of every saved slice.
process_and_serialize('test')

data_verify('test')

Serialize and down-sample test audios: 272it [01:21,  3.32it/s]
Verify serialized test audios: 100%|██████████| 8913/8913 [00:03<00:00, 2516.32it/s]


In [60]:
# Each saved array is [clean_slice, noisy_slice], so unpacking its first axis
# gives uno = clean slice and dos = noisy slice.
# NOTE(review): the SNR in the file name is drawn at random during
# serialization, so this exact file may be absent after a re-run — confirm.
uno, dos = np.load('../data/serialized_train_data/EN_C1_12_107.wav_noise_2.5_0_4.npy')

In [72]:
# List the serialized training files to inspect the generated file names.
os.listdir('../data/serialized_train_data/')

['EN_C1_12_107.wav_music_17.5_0_0.npy',
 'EN_C1_12_107.wav_music_17.5_0_1.npy',
 'EN_C1_12_107.wav_music_17.5_0_2.npy',
 'EN_C1_12_107.wav_music_17.5_0_3.npy',
 'EN_C1_12_107.wav_music_17.5_0_4.npy',
 'EN_C1_12_107.wav_music_17.5_0_5.npy',
 'EN_C1_12_107.wav_music_17.5_0_6.npy',
 'EN_C1_12_107.wav_music_17.5_0_7.npy',
 'EN_C1_12_107.wav_music_17.5_0_8.npy',
 'EN_C1_12_107.wav_music_17.5_0_9.npy',
 'EN_C1_12_107.wav_music_17.5_0_10.npy',
 'EN_C1_12_107.wav_music_17.5_0_11.npy',
 'EN_C1_12_107.wav_noise_17.5_0_0.npy',
 'EN_C1_12_107.wav_noise_17.5_0_1.npy',
 'EN_C1_12_107.wav_noise_17.5_0_2.npy',
 'EN_C1_12_107.wav_noise_17.5_0_3.npy',
 'EN_C1_12_107.wav_noise_17.5_0_4.npy',
 'EN_C1_12_107.wav_noise_17.5_0_5.npy',
 'EN_C1_12_107.wav_noise_17.5_0_6.npy',
 'EN_C1_12_107.wav_noise_17.5_0_7.npy',
 'EN_C1_12_107.wav_noise_17.5_0_8.npy',
 'EN_C1_12_107.wav_noise_17.5_0_9.npy',
 'EN_C1_12_107.wav_noise_17.5_0_10.npy',
 'EN_C1_12_107.wav_noise_17.5_0_11.npy',
 'EN_C1_12_107.wav_speech_12.5_0_0.n

In [76]:
# Probe a folder that may not exist and report the failure instead of raising.
# The listing function lives in `os` (`os.path` has no `listdir`), and only
# filesystem errors are caught so that programming mistakes are not hidden.
try:
    os.listdir('../data/prova')
except OSError:
    print('error')

error


In [61]:
# Play the clean slice (first row of the loaded pair) at 16000 samples/s.
ipd.Audio(uno, rate = 16000)

In [62]:
# Play the noisy slice (second row of the loaded pair) at 16000 samples/s.
ipd.Audio(dos, rate = 16000)