In [None]:
import glob
import os
import subprocess
from pydub import AudioSegment
from audio import Audio

In [None]:
# Build the Audio helper on top of the cached triplet-training directory and
# fetch the test-split mapping from it.
# NOTE(review): train_test_sp_to_utt is not imported in any visible cell; on a
# fresh kernel this cell presumably raises NameError — confirm where it lives
# and add it to the imports cell.
audio = Audio(
    cache_dir='/home/trungvd/.deep-speaker-wd/triplet-training',
)
sp_to_utt_test = train_test_sp_to_utt(audio, is_test=True)
n_test_entries = len(sp_to_utt_test)
print(n_test_entries)

In [None]:
# Collect every non-empty line from the test-clean transcript files and split
# each one at its first space into [utterance_id, transcript].
transcript_glob = '/home/trungvd/.deep-speaker-wd/LibriSpeech/test-clean/**/**/*.txt'
raw_lines = []
for transcript_path in glob.glob(transcript_glob):
    with open(transcript_path) as f:
        raw_lines.extend(f.read().split('\n'))
# Blank lines (e.g. the one after a trailing newline) are dropped here.
utts = [line.split(' ', 1) for line in raw_lines if line]
print(len(utts))

In [24]:
# Order the [utterance_id, transcript] pairs by transcript length, shortest
# first; sorted() is stable, so equal-length transcripts keep their order.
utts = sorted(
    utts,
    key=lambda pair: len(pair[1]),
)

In [25]:
# Show the first ten entries of utts (the shortest transcripts once utts has
# been sorted by transcript length).
print(utts[:10])

[['121-123852-0001', 'AY ME'], ['8555-292519-0002', 'VENICE'], ['237-134500-0025', 'OH EMIL'], ['672-122797-0033', 'A STORY'], ['7127-75947-0012', 'INDEED AH'], ['2094-142345-0041', 'DIRECTION'], ['2830-3980-0026', 'VERSE TWO'], ['260-123440-0001', 'POOR ALICE'], ['1089-134691-0018', 'AGAIN AGAIN'], ['237-134500-0001', 'MARIE SIGHED']]


In [35]:
def detect_leading_silence(sound, silence_threshold=-50.0, chunk_size=10):
    '''
    Return how many milliseconds of silence `sound` starts with.

    sound is a pydub.AudioSegment
    silence_threshold in dB; a chunk whose dBFS is below it counts as silent
    chunk_size in ms; the scan advances in steps of this size

    Iterates over chunks until the first one at or above the threshold is
    found. The result is a multiple of chunk_size, except that it is capped
    at len(sound): a fully silent (or empty) sound yields exactly len(sound).
    '''
    assert chunk_size > 0 # to avoid infinite loop

    total_ms = len(sound)
    trim_ms = 0 # ms

    # Check the bound first so dBFS is never evaluated on the empty slice that
    # lies past the end of the sound.
    while trim_ms < total_ms and sound[trim_ms:trim_ms+chunk_size].dBFS < silence_threshold:
        trim_ms += chunk_size

    # The last chunk may be shorter than chunk_size; without the cap the
    # result could exceed the sound's length and break callers that slice
    # with it.
    return min(trim_ms, total_ms)

In [41]:
# For the 100 entries at the front of utts: convert the FLAC to WAV, strip
# leading/trailing silence, overwrite the WAV with the trimmed audio, and write
# a one-row CSV per utterance. Finally write a summary file ordered by the
# trimmed duration.
root = '/home/trungvd/repos/speech-reconstruction/samples/librispeech'
os.makedirs(root, exist_ok=True)
outputs = []
for utt_id, transcript in utts[:100]:
    # Utterance ids look like "<a>-<b>-<n>"; the first two parts are the
    # sub-directories holding the audio file.
    utt_ids = utt_id.split('-')
    fp = os.path.join('/home/trungvd/.deep-speaker-wd/LibriSpeech/test-clean/', utt_ids[0], utt_ids[1], utt_id + '.wav')
    flac_path = '%s.flac' % (fp[:-4])
    # -y overwrites an existing WAV so re-running the cell does not hit
    # ffmpeg's interactive "overwrite?" prompt; check=True surfaces a failed
    # conversion here instead of as a confusing error when loading the WAV.
    subprocess.run(['ffmpeg', '-y', '-i', flac_path, fp], check=True)

    sound = AudioSegment.from_file(fp, format="wav")
    duration = len(sound)
    # Clamp both trims so they can never overlap: for an all-silent clip the
    # two scans together would otherwise exceed the duration and turn the
    # slice end negative.
    start_trim = min(detect_leading_silence(sound), duration)
    end_trim = min(detect_leading_silence(sound.reverse()), duration - start_trim)
    trimmed_sound = sound[start_trim:duration - end_trim]
    # Overwrites the freshly converted WAV in place with the trimmed audio.
    trimmed_sound.export(fp, format='wav')

    # NOTE(review): wav_filesize is always written as 0 — presumably a
    # placeholder the consumer ignores; confirm before relying on it.
    with open(os.path.join(root, utt_id + '.csv'), 'w') as f:
        f.write('wav_filename,wav_filesize,transcript\n%s,0,%s' % (fp, transcript.lower()))

    outputs.append([utt_id, transcript, len(trimmed_sound)])

# Shortest trimmed clip first; each line is "utt_id,transcript,duration_ms".
outputs.sort(key=lambda o: o[-1])
with open(os.path.join(root, 'transcript.txt'), 'w') as f:
    f.write('\n'.join([','.join([str(e) for e in o]) for o in outputs]))