In [1]:
import os, sys, shutil, logging, pickle
from glob import glob
import numpy as np
import librosa

In [2]:
dataset_name = 'vctk'
log_file = f'data-prep-{dataset_name}.log'
data_path = "/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/wav48"
save_path = os.path.join(os.path.dirname(data_path), 'cleaned-data')
os.makedirs(save_path, exist_ok=True)
# Structure /aac/id0551/videoid/audio.mp4
logging.basicConfig(
    filename=log_file,
    level=logging.DEBUG,
    format="%(asctime)s:%(levelname)s:%(message)s"
    )
print(f'Log path: {os.path.abspath(log_file)}')
logging.info(f'Save path: {save_path}')

Log path: /datadrive2/dalon/diarization-experiments/Speaker_Verification/Notebooks/data-prep/data-prep-vctk.log


In [4]:
# if tisv_frame=50, min length of utterance = 525ms
tisv_frame = 50 # max frame number of utterances of tdsv (lower values suffer)
window = 0.025 # 25ms
hop = 0.01 # 10ms This is frame level precision we will get
# pick the nfft atleast twice the size of window(whichs is the input) REF: https://stackoverflow.com/a/18080140/3959965
# ft kernel size, better to have in pow of 2
nfft = 512
speaker_list = []

counter = 0

In [5]:
for speaker in os.listdir(data_path):
    save_audio_path = os.path.join(save_path, os.path.join(save_path, speaker + ".npy"))
    utterances_spec = []
    for idx, audio_path in enumerate(glob(os.path.join(data_path,speaker + "/*.wav"))):
        logging.info(f'Processing {audio_path}')
        """ Full preprocess of text independent utterance. The log-mel-spectrogram is saved as numpy file.
        Each partial utterance is splitted by voice detection using DB
        and all the frames from each partial utterance are saved.
        """
#         save_audio_path = os.path.join(save_path, os.path.splitext(os.path.basename(audio_path))[0] + ".npy")
        try:
            # since all other datasets are in 16kHz, downsample these 44kHz to 16kHz
            utter, sr = librosa.core.load(audio_path, sr=16000)        # load audio
            # Get the duration
            duration = librosa.get_duration(utter, sr)
            # Duration of each window
            duration_per_frame = (duration / utter.shape[0])
            utter_min_len = (tisv_frame * hop + window) * sr    # lower bound of utterance length
            #     logging.debug(f'Duration: {duration}\nMin length of utterance: {utter_min_len * duration_per_frame}s')

    #         logging.debug(f'Processing: {idx + 1}/{len(all_files)}')
#             utterances_spec = []

            intervals = librosa.effects.split(utter, top_db=20)         # voice activity detection (Below 20db is considered silence)
            for interval in intervals:
                if (interval[1]-interval[0]) > utter_min_len:           # If partial utterance is sufficient long,
                    utter_part = utter[interval[0]:interval[1]]         # save first and last 180 frames of spectrogram.
            #                     logging.debug(f'{int(config.window * sr)},{int(config.hop * sr)}')
                    S = librosa.core.stft(y=utter_part, n_fft=nfft,
                                          win_length=int(window * sr), hop_length=int(hop * sr))
                    S = np.abs(S) ** 2
                    mel_basis = librosa.filters.mel(sr=sr, n_fft=nfft, n_mels=40)
                    S = np.log10(np.dot(mel_basis, S) + 1e-6)           # log mel spectrogram of utterances

                    prev_tisv_frame = 0
                    for i in range(1, S.shape[1]//tisv_frame + 1):
                        utterances_spec.append(S[:, prev_tisv_frame:tisv_frame * i])
                        prev_tisv_frame = tisv_frame * i

    #                 utterances_spec.append(S[:, :tisv_frame])    # first 180 frames of partial utterance
    #                 utterances_spec.append(S[:, -tisv_frame:])   # last 180 frames of partial utterance

        except Exception as e:
            logging.exception(e)
            logging.info(f'Failed in: {audio_path}')

    utterances_spec = np.array(utterances_spec)
#         os.makedirs(os.path.dirname(save_audio_path), exist_ok=True)
    if not utterances_spec.shape[0] == 0:
        logging.debug(utterances_spec.shape)
        # this will consists all the utterances for that chapter
        speaker_list.append([audio_path, utterances_spec.shape, save_audio_path])
        np.save(save_audio_path, utterances_spec)
#         break
with open(os.path.join(save_path, dataset_name + '.b'), "wb") as f:
    # save the distribution
    logging.info(f'Saving processed audio list to {os.path.join(save_path, dataset_name + ".b")}')
    pickle.dump(speaker_list, f)
logging.info("Completed!")
        

In [6]:
speaker_list[:3]

[['/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/wav48/p259/p259_384.wav',
  (1528, 40, 50),
  '/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/cleaned-data/p259.npy'],
 ['/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/wav48/p315/p315_158.wav',
  (427, 40, 50),
  '/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/cleaned-data/p315.npy'],
 ['/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/wav48/p269/p269_208.wav',
  (1256, 40, 50),
  '/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/cleaned-data/p269.npy']]