In [1]:
# packages
import argparse
import os
import sys
import pathlib
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import librosa

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


In [2]:
# paths
# SIWIS database https://www.unige.ch/lettres/linguistique/research/latl/siwis/database/
# MUSAN dataset https://www.openslr.org/17/
main_path = '/export/corpora/'  # absolute path of the directory that contains the corpora below
clean_audios = 'VoxCeleb2_test/aac/'  # clean-speech corpus, relative to main_path
noise_audios = 'musan'  # noise corpus, relative to main_path
# Directory (relative to main_path) that is listed for '*_audio_length.txt' files
# when utt2duration.scp is built. It has to be defined: the utt2duration cell reads
# this name and raises NameError on a fresh kernel otherwise.
info_clean_audios = 'siwis_database/info'
# NOTE(review): clean_audios points at VoxCeleb2 while info_clean_audios and the
# cells further down (an 'EN' language folder, '.wav' files, speaker taken from the
# third '_'-separated field of the file name) look like the SIWIS layout -- confirm
# which corpus this notebook is meant to process.

In [3]:
# Resolve both corpus directories under main_path in one pass, then report them.
clean_audios_path, noise_audios_path = (
    os.path.join(main_path, corpus_subdir)
    for corpus_subdir in (clean_audios, noise_audios)
)
print('Clean audios (siwis_database) directory: {}'.format(clean_audios_path))
print('Noise audios (musan dataset) directory: {}'.format(noise_audios_path))

Clean audios (siwis_database) directory: /export/corpora/VoxCeleb2_test/aac/
Noise audios (musan dataset) directory: /export/corpora/musan


In [None]:
# utt2duration
# Builds the utterance -> duration map `utt2duration`, cached on disk in
# ../data/utt2duration.scp as '<utt> <duration>' lines.
# Requires `main_path` and `info_clean_audios` from the paths cell.

os.makedirs('../data', exist_ok=True)

utt2duration = {}

if not os.path.isfile('../data/utt2duration.scp'):

    print('Making utt2duration.scp...')

    info_path = os.path.join(main_path, info_clean_audios)
    # Plain substring test instead of a regex: the unescaped '.' in the old pattern
    # matched any character. Sorted so the generated file is the same on every run
    # (os.listdir returns entries in arbitrary order).
    audio_length_files = sorted(elem for elem in os.listdir(info_path)
                                if '_audio_length.txt' in elem)

    # Parse every listing BEFORE creating the cache file. If parsing fails, no
    # half-written utt2duration.scp is left behind for later runs to load as if
    # it were complete.
    records = []
    for listing_name in audio_length_files:
        with open(os.path.join(info_path, listing_name)) as infile:
            for line in infile:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines
                utt, duration = fields
                utt = utt.replace('.wav', '')
                utt2duration[utt] = float(duration)
                records.append((utt, duration))

    if not records:
        # Previously this surfaced as an opaque ValueError from max([]) *after* an
        # empty cache file had already been written.
        raise ValueError('No durations found in {}'.format(info_path))

    with open('../data/utt2duration.scp', 'w') as outfile:
        for utt, duration in records:
            # The duration text is written exactly as read from the listing.
            outfile.write('{} {}\n'.format(utt, duration))

    durations = [float(duration) for _, duration in records]
    print('Clean audios measures:')
    print('Max: {}\nMin: {}\nMean: {}\nMedian: {}'.format(max(durations), min(durations),
                                                          np.mean(durations), np.median(durations)))
else:
    # Cache hit: reload the map from the previously written file.
    with open('../data/utt2duration.scp') as infile:
        for line in infile:
            fields = line.split()
            if not fields:
                continue
            utt, duration = fields
            utt2duration[utt] = float(duration)

    print('/data/utt2duration.scp file already exists.')

In [None]:
# make train and test wav.scp
# Scans the clean corpus, keeps the '.wav' utterances whose duration exceeds
# `min_length`, splits them 90/10 stratified by speaker and writes
# ../data/train/wav.scp and ../data/test/wav.scp ('<utt> <path>' per line).
# Requires `clean_audios_path` and `utt2duration` from the cells above.

min_length = 4.0  # compared against utt2duration values (presumably seconds -- TODO confirm)
split_seed = 0    # fixed so that re-running the cell reproduces the same split


def _write_wav_scp(scp_path, utt_ids, wav_paths):
    """Write one '<utt> <path>' line per utterance to `scp_path`.

    The parent directory is created when missing. Returns the number of lines written.
    """
    os.makedirs(os.path.dirname(scp_path), exist_ok=True)
    with open(scp_path, 'w') as outfile:
        for utt_id, wav_path in zip(utt_ids, wav_paths):
            outfile.write('{} {}\n'.format(utt_id, wav_path))
    return len(utt_ids)


utts = []
paths = []
spks = []
for language in ['EN']:  # os.listdir(clean_audios_path) would cover every language
    language_dir = os.path.join(clean_audios_path, language)
    # Listings are sorted so the arrays handed to train_test_split have a stable
    # order; together with `split_seed` this makes the split reproducible.
    for folder in sorted(os.listdir(language_dir)):
        folder_dir = os.path.join(language_dir, folder)
        if not os.path.isdir(folder_dir):
            continue  # stray files next to the folders would break os.listdir
        for utt in sorted(os.listdir(folder_dir)):
            tmp = os.path.join(folder_dir, utt)
            if pathlib.Path(tmp).suffix != '.wav':
                continue
            utt_id = utt.replace('.wav', '')
            if utt2duration[utt_id] > min_length:
                utts.append(utt_id)
                # The speaker label is the third '_'-separated field of the file name.
                spks.append(utt.split('_')[2])
                paths.append(tmp)

# Without random_state every run produced a different split, so regenerating only
# one of the two files could put the same utterance in both train and test.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(paths), np.array(utts), test_size=.1, stratify=spks, random_state=split_seed)

if not os.path.isfile('../data/train/wav.scp'):

    print('Making train...')
    _write_wav_scp('../data/train/wav.scp', y_train, X_train)

else:
    print('../data/train/wav.scp already exists.')

if not os.path.isfile('../data/test/wav.scp'):

    print('Making test...')
    _write_wav_scp('../data/test/wav.scp', y_test, X_test)

    # Real sizes of the current split. The old code reused the last enumerate()
    # index (one short of the count) and hit a NameError on `train_counter`
    # whenever train/wav.scp already existed.
    train_counter, test_counter = len(y_train), len(y_test)
    print('Detected {} audio files'.format(train_counter + test_counter))
    print('{} in train'.format(train_counter))
    print('{} in test'.format(test_counter))

else:
    print('../data/test/wav.scp already exists.')

In [None]:
# clean wav.scp
# print('Make clean_wav.scp')
# min_length = 4.0
# counter = {}

# with open('../data/clean_wav.scp', 'w') as file:
#     for language in os.listdir(clean_audios_path):
#         for folder in os.listdir(os.path.join(clean_audios_path, language)):
#             for utt in os.listdir(os.path.join(clean_audios_path, language, folder)):
#                 tmp = os.path.join(clean_audios_path, language, folder, utt)
#                 if pathlib.Path(tmp).suffix == '.wav':
#                     if utt2duration[utt.replace('.wav', '')] > min_length:
#                         file.write('{} {}\n'.format(utt.replace('.wav', ''), tmp))
#                         if language in counter:
#                             counter[language] += 1
#                         else:
#                             counter[language] = 1

# file.close()
# print('Detected {} clean audios'.format(sum(counter.values())))
# print(counter)

In [None]:
# noise wav.scp
# For each top-level folder of the noise corpus, writes ../data/musan_<folder>.scp
# with one '<utt> <path>' line per '.wav' file found one level further down.
# Everything is skipped when ../data/musan_noise.scp is already present.
print('\n\nChecking for noise wav.scp presence...')

if os.path.isfile('../data/musan_noise.scp'):
    print('musan noise wav files already exists.')

else:
    print('Make musan.scp')
    counter = {}  # folder name -> number of '.wav' files listed for it

    for folder in os.listdir(noise_audios_path):
        folder_dir = os.path.join(noise_audios_path, folder)
        if not os.path.isdir(folder_dir):
            continue

        print('Making musan_{}.scp'.format(folder))
        with open('../data/musan_{}.scp'.format(folder), 'w') as file:
            for subfolder in os.listdir(folder_dir):
                subfolder_dir = os.path.join(folder_dir, subfolder)
                if not os.path.isdir(subfolder_dir):
                    continue

                for utt in os.listdir(subfolder_dir):
                    if not utt.endswith('.wav'):
                        continue
                    utt_id = utt.replace('.wav', '')
                    file.write('{} {}\n'.format(utt_id, os.path.join(subfolder_dir, utt)))
                    counter[folder] = counter.get(folder, 0) + 1

    print('Detected {} noise audios'.format(sum(counter.values())))
    print(counter)

In [None]:
# utt2spk
# Writes ../data/utt2spk.scp ('<utt> <spk>' per line) for every utterance listed in
# the train and test wav.scp files, train first. The speaker is the third
# '_'-separated field of the utterance id.
print('\n\nChecking for utt2spk.scp presence...')

if not os.path.isfile('../data/utt2spk.scp'):

    # Read both inputs BEFORE creating the output. If a wav.scp is missing, no
    # truncated utt2spk.scp is left behind to make the next run skip this step.
    utt_spk_pairs = []
    for scp_path in ('../data/train/wav.scp', '../data/test/wav.scp'):
        with open(scp_path) as scp_file:
            for line in scp_file:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines
                # Only the first field is needed, so a path containing spaces no
                # longer breaks the parsing.
                utt = fields[0]
                utt_spk_pairs.append((utt, utt.split('_')[2]))

    list_of_speakers = {spk for _, spk in utt_spk_pairs}

    with open('../data/utt2spk.scp', 'w') as file:
        for utt, spk in utt_spk_pairs:
            file.write('{} {}\n'.format(utt, spk))

    print('{} speakers'.format(len(list_of_speakers)))

else:
    print('/data/utt2spk.scp already exists.')

<hr>

In [None]:
## VoxCeleb pipeline

# with open('/opt/kaldi/egs/Signal-denoising-in-the-wild/data/utt2durationVoxCeleb.scp', 'w') as outfile:
#     for speaker in os.listdir(clean_audios_path):
#         for video in os.listdir(os.path.join(clean_audios_path, speaker)):
#             for audio in os.listdir(os.path.join(clean_audios_path, speaker, video)):
#                 duration = librosa.load(os.path.join(clean_audios_path, speaker, video, audio), sr = 16000)[0].shape[0]
#                 utt = os.path.join(speaker, video, audio)
#                 outfile.write('{} {}\n'.format(utt, duration))
# outfile.close()

# utt2duration = {}
# for line in open('/opt/kaldi/egs/Signal-denoising-in-the-wild/data/utt2durationVoxCeleb.scp'):
#     utt, duration = line.rstrip().split()
#     utt2duration[utt] = float(duration)

# min_length = 4.0 * 16000
# utts = []
# paths = []
# spks = []
# for speaker in os.listdir(clean_audios_path):
#     for video in os.listdir(os.path.join(clean_audios_path, speaker)):
#         for audio in os.listdir(os.path.join(clean_audios_path, speaker, video)):
#             tmp = os.path.join(clean_audios_path, speaker, video, audio)
            
#             if pathlib.Path(tmp).suffix == '.m4a':
#                 if utt2duration[os.path.join(speaker, video, audio)] > min_length:
#                     utts.append(os.path.join(speaker, video, audio).replace('.m4a', ''))
#                     spks.append(speaker)
#                     paths.append(tmp)

# X_train, X_test, y_train, y_test = train_test_split(np.array(paths), np.array(utts), test_size = .7, stratify = spks)
# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = .2)

# with open('/opt/kaldi/egs/Signal-denoising-in-the-wild/data/test/wavVoxCeleb.scp', 'w') as outfile:
#     for train_counter, line in enumerate(np.column_stack((y_test, X_test))):
#         outfile.write('{} {}\n'.format(line[0], line[1]))

# outfile.close()

# list_of_speakers = set()
# with open('/opt/kaldi/egs/Signal-denoising-in-the-wild/data/utt2spkVoxCeleb.scp', 'w') as file:
#     for line in open('/opt/kaldi/egs/Signal-denoising-in-the-wild/data/train/wavVoxCeleb.scp'):
#         utt, path = line.split()
#         spk = utt.split('/')[0]
#         list_of_speakers.add(spk)
#         file.write('{} {}\n'.format(utt, spk))

#     for line in open('/opt/kaldi/egs/Signal-denoising-in-the-wild/data/test/wavVoxCeleb.scp'):
#         utt, path = line.split()
#         spk = utt.split('/')[0]
#         list_of_speakers.add(spk)
#         file.write('{} {}\n'.format(utt, spk))

# file.close()

# print('{} speakers'.format(len(list_of_speakers)))
