In [1]:
import os, pickle
import logging
import librosa
import numpy as np
import matplotlib.pyplot as plt
from configuration import get_config
from utils import keyword_spot


Namespace(M=5, N=4, beta1=0.5, beta2=0.9, comment='', hidden=128, hop=0.01, iteration=100000, loss='softmax', lr=0.01, model_num=6, model_path='./tisv_model', nfft=512, noise_filenum=16, noise_path='/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/noise', num_layer=3, optim='sgd', proj=64, restore=False, sr=8000, tdsv=False, tdsv_frame=80, test_path='/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/test', tisv_frame=180, train=False, train_path='/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/train', window=0.025)


In [2]:
# --- Dataset locations -------------------------------------------------------
# Source tree layout: <data_path>/<speaker id>/<video id>/<audio file>.m4a
dataset_name = 'voxceleb-2'
data_path = '/datadrive2/dalon/diarization-experiments/voxceleb-dataset/voxceleb-2/dev/aac'

# Prepared spectrograms are written next to the source tree, under "prepared-data".
dataset_root = os.path.dirname(data_path)
save_path = os.path.join(dataset_root, 'prepared-data')
os.makedirs(save_path, exist_ok=True)

# --- Logging -----------------------------------------------------------------
# The log file name is relative, so it is created in the notebook's working
# directory; the resolved location is printed for convenience.
log_file = 'data-prep.log'
log_format = "%(asctime)s:%(levelname)s:%(message)s"
logging.basicConfig(filename=log_file, level=logging.DEBUG, format=log_format)

print(f'Log path: {os.path.abspath(log_file)}')
logging.info(f'Data path: {data_path}\nSave path: {save_path}')

Log path: /datadrive2/dalon/diarization-experiments/Speaker_Verification/Notebooks/data-prep.log


In [3]:
# Index every utterance below data_path, which is laid out as
#     <data_path>/<speaker id>/<video id>/<audio file>
#
# all_files             -> list of [source audio path, target .npy path], one entry per .m4a file
# all_unique_extensions -> set of every file extension met at the audio level
#
# Directory listings are sorted so the order of all_files (and therefore the
# indices written to the log later on) is the same on every run; os.listdir
# returns entries in arbitrary order.
all_unique_extensions = set()
all_files = []
for base_id in sorted(os.listdir(data_path)):
    base_dir = os.path.join(data_path, base_id)
    if not os.path.isdir(base_dir):
        # A stray file at speaker level would make os.listdir raise; skip it instead.
        print(f'Skipping non-directory entry {base_dir}')
        continue
    for video_id in sorted(os.listdir(base_dir)):
        video_dir = os.path.join(base_dir, video_id)
        if not os.path.isdir(video_dir):
            # Same guard one level down.
            print(f'Skipping non-directory entry {video_dir}')
            continue
        for audio_id in sorted(os.listdir(video_dir)):
            audio_stem, extension = os.path.splitext(audio_id)
            all_unique_extensions.add(extension)
            audio_path = os.path.join(video_dir, audio_id)
            if extension == '.m4a':
                # The save path mirrors the source tree under save_path, with a .npy extension.
                all_files.append([audio_path,
                                  os.path.join(save_path, base_id, video_id, audio_stem) + '.npy'])
            else:
                print(f'Wrong file type in {audio_path}')
print(f'Unique file extensions: {all_unique_extensions}')

Unique file extensions: {'.m4a'}


In [4]:
# Report how many .m4a files were indexed into all_files.
utterance_count = len(all_files)
print(f'Number of utterences: {utterance_count}')

Number of utterences: 1092009


In [None]:
""" Full preprocess of text independent utterance. The log-mel-spectrogram is saved as numpy file.
    Each utterance is split into voiced segments by dB-threshold voice activity detection,
    and every complete, non-overlapping chunk of tisv_frame frames from each sufficiently long segment is saved.
"""

# Convert every audio file listed in all_files into log-mel-spectrogram chunks:
#   1. load the audio at its native sample rate,
#   2. find the voiced intervals with a dB threshold,
#   3. compute a log-mel-spectrogram for each sufficiently long interval,
#   4. cut it into complete, non-overlapping chunks of tisv_frame frames,
#   5. save the stacked chunks to the file's .npy path and record the array shape.
# Files that fail are logged and skipped so one bad file does not stop the run.

# With tisv_frame=50 a voiced interval must be longer than 50 * 10ms + 25ms = 525ms to be used.
tisv_frame = 50 # number of frames in each saved chunk (lower values suffer)
window = 0.025 # 25ms analysis window
hop = 0.01 # 10ms hop. This is the frame level precision we will get
# FFT size. librosa.stft requires win_length <= n_fft, so nfft must be at least
# int(window * sr) samples (e.g. 400 samples at 16 kHz); a power of 2 is preferred.
# REF: https://stackoverflow.com/a/18080140/3959965
nfft = 512
n_mels = 40 # number of mel bands in the saved spectrograms
top_db = 20 # audio more than 20 dB below the reference level is considered silence

# The mel filterbank depends only on (sr, nfft, n_mels), so it is built once per
# sample rate instead of once per voiced interval.
mel_basis_by_sr = {}

# After this loop each successfully processed entry of all_files is
# [audio path, save path, shape of the saved array]; failed entries keep only the two paths.
for idx, item in enumerate(all_files):
    # Drop any shape recorded by a previous run of this cell so that re-running
    # does not keep appending extra elements to the entry.
    del item[2:]
    try:
        audio_path = item[0]
        save_audio_path = item[1]
        utter, sr = librosa.core.load(audio_path, sr=None)        # load audio at its native sample rate

        # Window / hop in samples and the minimum usable interval length (in samples).
        win_length = int(window * sr)
        hop_length = int(hop * sr)
        utter_min_len = (tisv_frame * hop + window) * sr

        logging.debug(f'Processing: {idx + 1}/{len(all_files)}')

        if sr not in mel_basis_by_sr:
            mel_basis_by_sr[sr] = librosa.filters.mel(sr=sr, n_fft=nfft, n_mels=n_mels)
        mel_basis = mel_basis_by_sr[sr]

        utterances_spec = []
        intervals = librosa.effects.split(utter, top_db=top_db)    # voice activity detection
        for start, end in intervals:
            if (end - start) > utter_min_len:                       # only sufficiently long voiced intervals
                utter_part = utter[start:end]
                power_spec = np.abs(librosa.core.stft(y=utter_part, n_fft=nfft,
                                                      win_length=win_length,
                                                      hop_length=hop_length)) ** 2
                # log mel spectrogram; 1e-6 avoids log10(0)
                log_mel = np.log10(np.dot(mel_basis, power_spec) + 1e-6)

                # Keep every complete chunk of tisv_frame frames; a trailing partial chunk is dropped.
                for chunk_idx in range(log_mel.shape[1] // tisv_frame):
                    chunk_start = chunk_idx * tisv_frame
                    utterances_spec.append(log_mel[:, chunk_start:chunk_start + tisv_frame])

        # NOTE: when no interval is long enough this is an empty array of shape (0,),
        # which is still saved so every processed file has a matching .npy.
        utterances_spec = np.array(utterances_spec)
        item.append(utterances_spec.shape)
        logging.debug(utterances_spec.shape)
        os.makedirs(os.path.dirname(save_audio_path), exist_ok=True)
        np.save(save_audio_path, utterances_spec)
    except Exception as e:
        # Deliberate best effort: record the failure and continue with the next file.
        logging.exception(e)
        logging.info(f'Failed in: {all_files[idx]}')

# Persist the file list (with recorded shapes) next to the dataset directory.
all_files_path = os.path.join(os.path.dirname(data_path), dataset_name + '.b')
with open(all_files_path, "wb") as f:
    pickle.dump(all_files, f)
# Log only after the dump has actually succeeded.
logging.info(f'Saved the all files list to {all_files_path}')
logging.info("Completed!")