In [58]:
# packages

import os
import sys
import pathlib
import re
import numpy as np

In [59]:
# Corpus locations: `main_path` is the root directory, the other three names
# are locations expressed relative to that root.

main_path = "/export/corpora/"
clean_audios = "siwis_database/wav/"
noise_audios = "musan"
info_clean_audios = "siwis_database/info"

In [60]:
# Full directories of the clean and noise audio, each obtained by joining the
# corresponding relative location onto `main_path`.
clean_audios_path, noise_audios_path = (
    os.path.join(main_path, subdir) for subdir in (clean_audios, noise_audios)
)

In [61]:
# utt2duration
# Build the utterance -> duration mapping from every listing in the info
# directory whose name contains '_audio_length.txt', and mirror it to
# ../data/utt2duration.scp as "<utt> <duration>" lines.
# Each listing line is expected to hold two whitespace-separated fields:
# a file name and a duration (presumably in seconds -- TODO confirm).
utt2duration = {}

info_path = os.path.join(main_path, info_clean_audios)
info_dir = os.listdir(info_path)
# Plain substring test instead of a regex: in the pattern '_audio_length.txt'
# the unescaped '.' matched any character, not just a literal dot.
# sorted() because os.listdir returns entries in arbitrary order, which would
# otherwise make the order of the written file vary between runs.
audio_length_files = sorted(elem for elem in info_dir if '_audio_length.txt' in elem)

with open('../data/utt2duration.scp', 'w') as outfile:
    durations = []
    for file in audio_length_files:
        # Context manager so each listing is closed as soon as it has been read.
        with open(os.path.join(info_path, file)) as listing:
            for line in listing:
                fields = line.split()
                if not fields:
                    # Blank (e.g. trailing) lines carry no record; skip them
                    # rather than failing on the two-value unpack below.
                    continue
                utt, duration = fields
                utt = utt.replace('.wav', '')
                duration_value = float(duration)
                durations.append(duration_value)
                # The duration is written back exactly as it appeared in the listing.
                outfile.write('{} {}\n'.format(utt, duration))
                utt2duration[utt] = duration_value

# No explicit close(): the `with` block has already closed the output file.
print('Clean audios measures:')
print('Max: {}\nMin: {}\nMean: {}\nMedian: {}'.format(max(durations), min(durations), np.mean(durations), np.median(durations)))

Clean audios measures:
Max: 147.512472
Min: 0.704399
Mean: 5.761135318841562
Median: 5.149864


In [62]:
# clean wav.scp
# Walk <clean_audios_path>/<language>/<folder>/ and write "<utt> <path>" to
# ../data/clean_wav.scp for every '.wav' file whose duration in `utt2duration`
# is strictly greater than `min_length`. `counter` records how many utterances
# were kept per language folder.
min_length = 4.0
counter = {}
# Wav files found on disk that have no entry in `utt2duration`.
missing_duration = []

with open('../data/clean_wav.scp', 'w') as file:
    # sorted(): os.listdir returns entries in arbitrary order, so sorting makes
    # the generated file reproducible across runs and machines.
    for language in sorted(os.listdir(clean_audios_path)):
        language_path = os.path.join(clean_audios_path, language)
        if not os.path.isdir(language_path):
            # Stray files next to the language folders would make listdir fail.
            continue
        for folder in sorted(os.listdir(language_path)):
            folder_path = os.path.join(language_path, folder)
            if not os.path.isdir(folder_path):
                continue
            for utt in sorted(os.listdir(folder_path)):
                if pathlib.Path(utt).suffix != '.wav':
                    continue
                # Same id derivation as used for the keys of `utt2duration`.
                utt_id = utt.replace('.wav', '')
                utt_duration = utt2duration.get(utt_id)
                if utt_duration is None:
                    # Report unknown durations after the walk instead of
                    # aborting with a KeyError and a half-written file.
                    missing_duration.append(utt_id)
                    continue
                if utt_duration > min_length:
                    tmp = os.path.join(folder_path, utt)
                    file.write('{} {}\n'.format(utt_id, tmp))
                    counter[language] = counter.get(language, 0) + 1

# No explicit close(): the `with` block has already closed the file.
if missing_duration:
    print('Skipped {} audios without a known duration'.format(len(missing_duration)))
print('Detected {} clean audios'.format(sum(counter.values())))
print(counter)

Detected 10997 clean audios
{'DE': 2093, 'EN': 2720, 'FR': 3906, 'IT': 2278}


In [63]:
# noise wav.scp
# Walk <noise_audios_path>/<folder>/<subfolder>/ and write "<utt> <path>" to
# ../data/noise_wav.scp for every file whose name ends in '.wav'. `counter`
# records the number of files found per top-level folder.
counter = {}

with open('../data/noise_wav.scp', 'w') as file:
    # sorted(): os.listdir returns entries in arbitrary order, so sorting makes
    # the generated file reproducible across runs and machines.
    for folder in sorted(os.listdir(noise_audios_path)):
        folder_path = os.path.join(noise_audios_path, folder)
        if not os.path.isdir(folder_path):
            continue
        for subfolder in sorted(os.listdir(folder_path)):
            subfolder_path = os.path.join(folder_path, subfolder)
            if not os.path.isdir(subfolder_path):
                continue
            for utt in sorted(os.listdir(subfolder_path)):
                if not utt.endswith('.wav'):
                    continue
                file.write('{} {}\n'.format(utt.replace('.wav', ''),
                                            os.path.join(subfolder_path, utt)))
                counter[folder] = counter.get(folder, 0) + 1

# No explicit close(): the `with` block has already closed the file.
print('Detected {} noise audios'.format(sum(counter.values())))
print(counter)

Detected 2016 noise audios
{'music': 660, 'noise': 930, 'speech': 426}


In [64]:
# utt2spk
# For every "<utt> <path>" line of ../data/clean_wav.scp, write "<utt> <spk>"
# to ../data/utt2spk.scp, where <spk> is the third '_'-separated field of the
# utterance id. `list_of_speakers` collects the distinct speaker ids.
list_of_speakers = set()
# The input is opened first so that a missing clean_wav.scp does not truncate
# an existing utt2spk.scp; both files are closed by the `with` block.
with open('../data/clean_wav.scp') as wav_scp, open('../data/utt2spk.scp', 'w') as file:
    for line in wav_scp:
        record = line.strip()
        if not record:
            # Ignore blank lines instead of failing on the unpack below.
            continue
        # maxsplit=1 keeps a path that itself contains whitespace in one piece.
        utt, path = record.split(maxsplit=1)
        spk = utt.split('_')[2]
        list_of_speakers.add(spk)
        file.write('{} {}\n'.format(utt, spk))

print('{} speakers'.format(len(list_of_speakers)))

36 speakers
