In [1]:
import os, pickle
import logging
from collections import defaultdict
import librosa
import numpy as np
import matplotlib.pyplot as plt
# from configuration import get_config
# from utils import keyword_spot


In [2]:
# --- Dataset identity and locations ---------------------------------------
dataset_name = 'voxceleb-1'
data_path = '/datadrive/dalon/diarizer-dataset/voxceleb-1/wav'
save_path = "/datadrive/dalon/diarizer-dataset/vctk-vox1n2-libri-npy"

# Both the log directory and the output directory must exist before use.
for required_dir in ('logs', save_path):
    os.makedirs(required_dir, exist_ok=True)
log_file = f'logs/data-prep-{dataset_name}.log'

# Expected layout under data_path: <base_id>/<video_id>/<audio file>
# (this is how the discovery cell below walks it).
logging.basicConfig(
    filename=log_file,
    level=logging.DEBUG,
    format="%(asctime)s:%(levelname)s:%(message)s",
)
print(f'Log path: {os.path.abspath(log_file)}')
logging.info(f'Data path: {data_path}\nSave path: {save_path}')

Log path: /datadrive/dalon/diarization-experiments/Notebooks/data-prep/logs/data-prep-voxceleb-1.log


In [3]:

# Walk the <data_path>/<base_id>/<video_id>/<audio file> layout and group the
# .wav paths by base_id.
#   all_unique_extensions: extension of every file seen (one entry per file)
#   all_files:             base_id -> list of .wav paths
# Directory listings are sorted because os.listdir returns entries in
# arbitrary order; sorting makes the order of the collected paths (and hence
# of everything derived from them) reproducible between runs.
all_unique_extensions = []
all_files = defaultdict(list)
for base_id in sorted(os.listdir(data_path)):
    base_dir = os.path.join(data_path, base_id)
    if not os.path.isdir(base_dir):
        # A stray file at this level would make os.listdir(base_dir) raise.
        print(f'Skipping non-directory entry {base_dir}')
        continue
    for video_id in sorted(os.listdir(base_dir)):
        video_dir = os.path.join(base_dir, video_id)
        if not os.path.isdir(video_dir):
            print(f'Skipping non-directory entry {video_dir}')
            continue
        for audio_id in sorted(os.listdir(video_dir)):
            audio_file = os.path.join(video_dir, audio_id)
            extension = os.path.splitext(audio_id)[1]
            all_unique_extensions.append(extension)
            if extension == '.wav':
                all_files[base_id].append(audio_file)
            else:
                print(f'Wrong file type in {audio_file}')
print(f'Unique file extensions: {set(all_unique_extensions)}')

Unique file extensions: {'.wav'}


In [4]:
# NOTE: all_files is keyed by base_id (the first directory level under
# data_path), so len(all_files) is the number of base_id groups, not the
# number of individual .wav files, despite the label in the message.
print(f'Number of utterences: {len(all_files)}')

Number of utterences: 1251


In [5]:
# for k, v in all_files.items():
#     print(k, v)
#     break

In [None]:
""" Full preprocess of text independent utterance. The log-mel-spectrogram is saved as numpy file.
    Each partial utterance is splitted by voice detection using DB
    and all the frames from each partial utterance are saved.
"""

# A voiced interval is only used when it is longer than
# tisv_frame * hop + window seconds:
#   tisv_frame=50  -> 525ms
#   tisv_frame=160 -> 1625ms
tisv_frame = 160  # number of spectrogram frames kept per saved segment (lower values suffer)
window = 0.025  # 25ms analysis window
hop = 0.01  # 10ms hop; this is the frame-level precision we will get
# nfft is the FFT size; it must be no smaller than the window length in samples
# (int(window * sr) = 400 here), and a power of two is preferred.
# REF: https://stackoverflow.com/a/18080140/3959965
nfft = 512
sr = 16000
speaker_list = []

# Loop-invariant values, computed once instead of per file / per interval.
win_length = int(window * sr)
hop_length = int(hop * sr)
utter_min_len = (tisv_frame * hop + window) * sr  # lower bound of interval length, in samples
mel_basis = librosa.filters.mel(sr=sr, n_fft=nfft, n_mels=40)

# all_files maps base_id -> list of .wav paths; one .npy file is written per base_id.
for speaker_id, wav_paths in all_files.items():
    save_audio_path = os.path.join(save_path, f'{dataset_name}_{speaker_id}.npy')
    utterances_spec = []
    try:
        for idx, audio_path in enumerate(wav_paths):
            # Load (and resample if needed) at the target rate; the returned
            # rate equals the requested one, so it is not kept.
            utter, _sr = librosa.load(audio_path, sr=sr)

            logging.debug(f'Processing: {idx + 1}/{len(wav_paths)}')

            # Energy-based voice activity detection: anything more than 20 dB
            # below the reference level is treated as silence.
            intervals = librosa.effects.split(utter, top_db=20)
            for start, end in intervals:
                # Only keep intervals long enough to yield tisv_frame frames.
                if (end - start) > utter_min_len:
                    utter_part = utter[start:end]
                    S = librosa.stft(y=utter_part, n_fft=nfft,
                                     win_length=win_length, hop_length=hop_length)
                    S = np.abs(S) ** 2  # power spectrogram
                    # Log-mel spectrogram; 1e-6 avoids log10(0).
                    S = np.log10(np.dot(mel_basis, S) + 1e-6)

                    utterances_spec.append(S[:, :tisv_frame])    # first tisv_frame frames of the interval
                    utterances_spec.append(S[:, -tisv_frame:])   # last tisv_frame frames of the interval
        utterances_spec = np.array(utterances_spec)
        if not utterances_spec.shape[0] == 0:
            speaker_list.append([speaker_id, utterances_spec.shape, save_audio_path])
            logging.debug(utterances_spec.shape)
            np.save(save_audio_path, utterances_spec)
    except Exception as e:
        # Best effort: a failure anywhere in this base_id's files skips the
        # whole base_id (nothing is saved for it) and processing continues.
        logging.exception(e)
        logging.info(f'Failed in: {speaker_id}')

# Persist the [base_id, array shape, .npy path] records for all saved speakers.
utter_info_path = dataset_name + '_utter_info.b'
with open(utter_info_path, "wb") as f:
    # save the distribution
    logging.info(f'Saving processed audio list to {utter_info_path}')
    pickle.dump(speaker_list, f)
logging.info("Completed!")

In [7]:
# Write speaker_list to <dataset_name>_utter_info.b in the working directory.
# This is the same file the final step of the preprocessing cell writes.
utter_info_file = dataset_name + '_utter_info.b'
with open(utter_info_file, "wb") as f:
    logging.info(f'Saving processed audio list to {utter_info_file}')
    pickle.dump(speaker_list, f)