## Audio processing using librosa

In [1]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
from configuration import get_config
from utils import keyword_spot

# Parsed run configuration (sample rate, STFT window/hop, frame counts, output paths, ...).
config = get_config()   # get arguments from parser

# Locations of the downloaded dataset.
# NOTE(review): these are absolute, machine-specific paths; the notebook will not run
# elsewhere until they are edited (consider deriving them from a single data root).
audio_path= '/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/wav48' # utterance dataset: one sub-folder per speaker
clean_path = '/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/clean-wav48' # clean dataset (not referenced by the cells visible here)
# Disabled: noisy dataset location from a previous (Windows) setup.
#noisy_path = r'C:\Users\LG\Documents\Deep_learning\speaker_vertification\noisy_testset_wav'  # noisy dataset

Namespace(M=5, N=4, beta1=0.5, beta2=0.9, comment='', hidden=128, hop=0.01, iteration=100000, loss='softmax', lr=0.01, model_num=6, model_path='./tisv_model', nfft=512, noise_filenum=16, noise_path='/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/noise', num_layer=3, optim='sgd', proj=64, restore=False, sr=8000, tdsv=False, tdsv_frame=80, test_path='/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/test', tisv_frame=180, train=False, train_path='/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/train', window=0.025)


In [34]:
def save_spectrogram_tisv(skip_folders=None, train_index_offset=64):
    """Full preprocessing of text-independent utterances.

    For every speaker folder under the module-level ``audio_path``, each audio
    file is loaded at ``config.sr`` and split into non-silent intervals with
    ``librosa.effects.split(top_db=20)``. For every interval longer than
    ``utter_min_len`` samples a 40-band log-mel spectrogram is computed, and
    its first and last ``config.tisv_frame`` frames are kept. All kept slices
    of one speaker are stacked into an array of shape
    ``(num_slices, 40, config.tisv_frame)`` and saved as a numpy file in
    ``config.train_path`` or ``config.test_path``.

    Needs: utterance data set (VCTK), plus the module-level ``config`` and
    ``audio_path``.

    Args:
        skip_folders: optional iterable of speaker folder names to skip
            (e.g. speakers that were already processed in a previous run).
            Defaults to skipping nothing. This replaces an undefined global
            ``c`` that made the function fail on a fresh kernel.
        train_index_offset: value added to the speaker index when naming the
            training files (``speaker<i + offset>.npy``). The default of 64
            keeps the file names the original code produced.
            NOTE(review): the reason for the offset is not visible in this
            notebook; confirm it is still wanted.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    print("start text independent utterance feature extraction")
    os.makedirs(config.train_path, exist_ok=True)   # make folder to save train file
    os.makedirs(config.test_path, exist_ok=True)    # make folder to save test file

    skip_folders = frozenset(skip_folders) if skip_folders is not None else frozenset()

    utter_min_len = (config.tisv_frame * config.hop + config.window) * config.sr    # lower bound of utterance length

    # Loop-invariant STFT / mel parameters: audio is always loaded at config.sr,
    # so these do not depend on the individual file.
    win_length = int(config.window * config.sr)
    hop_length = int(config.hop * config.sr)
    mel_basis = librosa.filters.mel(sr=config.sr, n_fft=config.nfft, n_mels=40)

    # List the speakers once so the counts and the iteration see the same entries.
    # NOTE: os.listdir order is filesystem-dependent, so the train/test assignment
    # follows whatever order the OS returns.
    speaker_folders = os.listdir(audio_path)
    total_speaker_num = len(speaker_folders)
    train_speaker_num = (total_speaker_num // 10) * 9       # split total data roughly 90% train and 10% test
    print("total speaker number : %d"%total_speaker_num)
    print("train : %d, test : %d"%(train_speaker_num, total_speaker_num-train_speaker_num))

    for i, folder in enumerate(speaker_folders):
        if folder in skip_folders:
            continue
        speaker_path = os.path.join(audio_path, folder)     # path of each speaker
        print("%dth speaker processing path:%s..."%(i,speaker_path))
        utterances_spec = []
        for utter_name in os.listdir(speaker_path):
            utter_path = os.path.join(speaker_path, utter_name)         # path of each utterance
            # `sr` must be passed by keyword: newer librosa releases reject it positionally.
            utter, _ = librosa.load(utter_path, sr=config.sr)           # load utterance audio
            intervals = librosa.effects.split(utter, top_db=20)         # voice activity detection
            for start, end in intervals:
                if (end - start) <= utter_min_len:                      # skip partial utterances that are too short
                    continue
                utter_part = utter[start:end]
                S = librosa.stft(y=utter_part, n_fft=config.nfft,
                                 win_length=win_length, hop_length=hop_length)
                S = np.abs(S) ** 2                                      # power spectrogram
                S = np.log10(np.dot(mel_basis, S) + 1e-6)               # log mel spectrogram of utterances

                utterances_spec.append(S[:, :config.tisv_frame])        # first tisv_frame frames of partial utterance
                utterances_spec.append(S[:, -config.tisv_frame:])       # last tisv_frame frames of partial utterance
        print(f'Completed {speaker_path}')
        utterances_spec = np.array(utterances_spec)
        print(utterances_spec.shape)
        if i < train_speaker_num:      # save spectrogram as numpy file
            np.save(os.path.join(config.train_path, "speaker%d.npy"%(i+train_index_offset)), utterances_spec)
        else:
            np.save(os.path.join(config.test_path, "speaker%d.npy"%(i-train_speaker_num)), utterances_spec)

In [None]:
# %%timeit
# # Run the preprocessing
# save_spectrogram_tisv()

## Data processing for inference

In [15]:
# Exploratory run of the TI-SV preprocessing on ONE utterance of ONE speaker.
# It mirrors save_spectrogram_tisv() but stops after the first file so that the
# intermediate variables (`utter`, `sr`, `intervals`, `utterances_spec`) can be
# inspected in the following cells. Nothing is saved to disk.
print("start text independent utterance feature extraction")
os.makedirs(config.train_path, exist_ok=True)   # make folder to save train file
os.makedirs(config.test_path, exist_ok=True)    # make folder to save test file

utter_min_len = (config.tisv_frame * config.hop + config.window) * config.sr    # lower bound of utterance length

# Loop-invariant mel filterbank (40 bands) for the configured sample rate / FFT size.
mel_basis = librosa.filters.mel(sr=config.sr, n_fft=config.nfft, n_mels=40)

speaker_folders = os.listdir(audio_path)
total_speaker_num = len(speaker_folders)
train_speaker_num= (total_speaker_num//10)*9            # split total data roughly 90% train and 10% test
print("total speaker number : %d"%total_speaker_num)
print("train : %d, test : %d"%(train_speaker_num, total_speaker_num-train_speaker_num))
for i, folder in enumerate(speaker_folders):
    speaker_path = os.path.join(audio_path, folder)     # path of each speaker
    print("%dth speaker processing path:%s..."%(i,speaker_path))
    utterances_spec = []
    for utter_name in os.listdir(speaker_path):
        utter_path = os.path.join(speaker_path, utter_name)         # path of each utterance
        # `sr` must be passed by keyword: newer librosa releases reject it positionally.
        utter, sr = librosa.load(utter_path, sr=config.sr)          # load utterance audio
        intervals = librosa.effects.split(utter, top_db=20)         # voice activity detection
        for interval in intervals:
            if (interval[1]-interval[0]) > utter_min_len:           # If partial utterance is sufficiently long,
                utter_part = utter[interval[0]:interval[1]]         # keep first and last tisv_frame frames of spectrogram.
                S = librosa.stft(y=utter_part, n_fft=config.nfft,
                                 win_length=int(config.window * sr), hop_length=int(config.hop * sr))
                S = np.abs(S) ** 2                                  # power spectrogram
                S = np.log10(np.dot(mel_basis, S) + 1e-6)           # log mel spectrogram of utterances

                utterances_spec.append(S[:, :config.tisv_frame])    # first tisv_frame frames of partial utterance
                utterances_spec.append(S[:, -config.tisv_frame:])   # last tisv_frame frames of partial utterance
        break   # only the first utterance file of this speaker

    print(f'Completed {speaker_path}')
    utterances_spec = np.array(utterances_spec)
    print(utterances_spec.shape)
    break       # only the first speaker; saving to train/test is intentionally skipped here

start text independent utterance feature extraction
total speaker number : 109
train : 90, test : 19
0th speaker processing path:/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/wav48/p259...
Completed /datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/wav48/p259
(4, 40, 180)


In [5]:
intervals

array([[11264, 31232],
       [34304, 61440]])

In [6]:
sr

8000

In [9]:
utter.shape

(66592,)

In [14]:
utter[11264:31232]

array([0.00127534, 0.00122136, 0.0006043 , ..., 0.00128434, 0.001479  ,
       0.00149367], dtype=float32)

In [11]:
# Duration of the loaded utterance in seconds (number of samples / sample rate).
# Arguments are passed by keyword: newer librosa releases do not accept them positionally.
librosa.get_duration(y=utter, sr=sr)

8.324

In [33]:
# NOTE(review): 8324/8000 divides the clip duration in milliseconds (8.324 s) by the
# sample rate (8000 Hz), which is not a per-frame duration. One sample lasts
# 1000/8000 = 0.125 ms, and with config.hop = 0.01 / config.window = 0.025 each STFT
# frame advances 10 ms and spans 25 ms. The label on this value looks wrong; confirm intent.
f'Each frame in ms {8324/8000}'

'Each frame in ms 1.0405'

In [30]:
# Load the first 10 s of librosa's bundled example clip, resampled to 100 Hz
# (10 s * 100 Hz = 1000 samples).
# Caution: this rebinds the global `sr` (previously 8000 from the utterance load) to 100.
# NOTE(review): example_audio_file() is believed to be deprecated/removed in newer
# librosa releases in favour of librosa.example(...); confirm against the installed version.
filename = librosa.util.example_audio_file()
y, sr = librosa.load(filename, sr=100, duration=10)

In [31]:
y.shape

(1000,)

In [25]:
y.shape

(491671,)

In [26]:
# Duration of the example clip in seconds.
# Arguments are passed by keyword: newer librosa releases do not accept them positionally.
librosa.get_duration(y=y, sr=sr)

61.458875

In [40]:
y.shape[0]/sr

10.0

In [34]:
# Load the saved log-mel slices of one test speaker.
# The directory is the same string as config.test_path in the parsed configuration;
# NOTE(review): consider using config.test_path instead of repeating the absolute path.
path = "/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/test"
file = "speaker0.npy"
utters = np.load(os.path.join(path, file))  # array laid out as (slice, mel band, frame)

In [42]:
utters.shape

(334, 40, 180)

In [45]:
# Start a batch (a list of array chunks) with the first 4 spectrogram slices of this speaker.
utter_batch=[]
utter_batch.append(utters[0: 4])

In [46]:
# Merge the list of chunks into one array along the first (slice) axis.
# With a single chunk in the list the result is simply that chunk as an array.
utter_batch = np.concatenate(utter_batch, axis=0)

In [48]:
utter_batch.shape

(4, 40, 180)

In [49]:
# Keep only the first 160 frames (last axis) of every slice.
# NOTE(review): 160 is a magic number here — presumably the frame count expected at
# inference time; confirm and move it to a named constant/config value.
utter_batch = utter_batch[:,:,:160] 

In [50]:
utter_batch.shape

(4, 40, 160)

In [51]:
# Reorder axes from (slice, mel band, frame) to (frame, slice, mel band),
# i.e. a time-major layout.
utter_batch = np.transpose(utter_batch, axes=(2,0,1))
utter_batch.shape

(160, 4, 40)

In [52]:
utter_batch

array([[[-2.96778699, -3.23434766, -4.39895536, ..., -5.99681126,
         -5.99729541, -5.99971237],
        [-1.68480869, -0.93685481, -1.12681371, ..., -2.13418344,
         -2.3309338 , -3.11818273],
        [-4.34632155, -4.47610423, -4.65433585, ..., -5.9924307 ,
         -5.99650542, -5.99639753],
        [-3.33079824, -2.90644971, -3.03378336, ..., -5.92655535,
         -5.97386883, -5.98753416]],

       [[-2.38258957, -3.13043269, -4.94321763, ..., -5.99693354,
         -5.99343982, -5.99907139],
        [-1.71488264, -0.87780576, -1.11357438, ..., -2.13701642,
         -2.20448508, -2.94353631],
        [-3.11931196, -4.3663856 , -5.63164155, ..., -5.99180185,
         -5.98991966, -5.99476769],
        [-2.50139942, -1.45316429, -1.28221885, ..., -5.99326127,
         -5.99495222, -5.99250987]],

       [[-2.67323416, -3.69146765, -5.10566556, ..., -5.99502541,
         -5.99688395, -5.99914167],
        [-1.72387589, -0.82287668, -1.06624966, ..., -2.21298825,
         -2.