In [1]:
"""
Alexander Brown
1/26/2022

We want to concatenate all of the audio clips for each person, 
and keep the results that fit our length preferences.

https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html
"""

import os
import soundfile as sf
import librosa
from pydub import AudioSegment

In [2]:
"""
determine the total length of .wav file duration available for each character
"""
# specify the parent directory containing all of the 'id' directories
# note that directory notation must use '/' rather than '\'
pc_specific_path = 'D:/ZaknafeinII/daea2022'
vox_dev_specific_path = '/vox1_dev_wav_partaa~/wav'
datafolder = pc_specific_path + vox_dev_specific_path

# list of all the 'id' directory names; sorted because os.listdir returns
# entries in arbitrary order and we want a reproducible processing order
character_ids = sorted(os.listdir(datafolder))

# maps character id -> total duration (in seconds) of that character's .wav files
char_durations = {}

# iterate over each different character
for character_id in character_ids:

    summed_duration = 0

    curr_char_dir = datafolder+'/'+character_id

    # iterate over each audio directory belonging to that character
    for (dirpath, dirnames, filenames) in os.walk(curr_char_dir):

        # iterate over each file in the audio directory
        for filename in filenames:

            # ignore stray non-audio files instead of handing them to librosa
            if not filename.lower().endswith('.wav'):
                continue

            filepath = dirpath+'/'+filename

            # add the file duration to the summed duration for the character
            summed_duration = summed_duration + librosa.get_duration(filename=filepath)

    # record the total once per character, outside the file loops, so that a
    # character with no .wav files still gets an entry (0) rather than being
    # missing from the dict when later cells look it up
    char_durations[character_id] = summed_duration

    print('%s had a duration of %f seconds (%f minutes)' % (character_id, summed_duration, summed_duration/60))
    

    

id10004 had a duration of 969.526750 seconds (16.158779 minutes)
id10009 had a duration of 443.883687 seconds (7.398061 minutes)
id10011 had a duration of 983.327438 seconds (16.388791 minutes)
id10016 had a duration of 2393.055813 seconds (39.884264 minutes)
id10017 had a duration of 928.369500 seconds (15.472825 minutes)
id10018 had a duration of 1828.496813 seconds (30.474947 minutes)
id10019 had a duration of 534.484875 seconds (8.908081 minutes)
id10020 had a duration of 4509.836813 seconds (75.163947 minutes)
id10022 had a duration of 1212.613125 seconds (20.210219 minutes)
id10027 had a duration of 766.885437 seconds (12.781424 minutes)
id10030 had a duration of 675.925562 seconds (11.265426 minutes)
id10032 had a duration of 998.845875 seconds (16.647431 minutes)
id10035 had a duration of 984.888875 seconds (16.414815 minutes)
id10036 had a duration of 1109.248438 seconds (18.487474 minutes)
id10039 had a duration of 1211.885500 seconds (20.198092 minutes)
id10041 had a duratio

id10434 had a duration of 1785.453125 seconds (29.757552 minutes)
id10440 had a duration of 1198.449438 seconds (19.974157 minutes)
id10442 had a duration of 677.326563 seconds (11.288776 minutes)
id10445 had a duration of 1209.530250 seconds (20.158838 minutes)
id10446 had a duration of 857.686875 seconds (14.294781 minutes)
id10448 had a duration of 1368.809188 seconds (22.813486 minutes)
id10449 had a duration of 1823.653875 seconds (30.394231 minutes)
id10455 had a duration of 345.723875 seconds (5.762065 minutes)
id10460 had a duration of 1184.047562 seconds (19.734126 minutes)
id10463 had a duration of 965.487750 seconds (16.091462 minutes)
id10466 had a duration of 1279.929813 seconds (21.332164 minutes)
id10468 had a duration of 354.643312 seconds (5.910722 minutes)
id10473 had a duration of 1620.011625 seconds (27.000194 minutes)
id10476 had a duration of 663.403563 seconds (11.056726 minutes)
id10477 had a duration of 411.723500 seconds (6.862058 minutes)
id10481 had a durati

id10810 had a duration of 482.003937 seconds (8.033399 minutes)
id10811 had a duration of 447.883375 seconds (7.464723 minutes)
id10815 had a duration of 410.602938 seconds (6.843382 minutes)
id10817 had a duration of 1481.931187 seconds (24.698853 minutes)
id10820 had a duration of 1193.007063 seconds (19.883451 minutes)
id10830 had a duration of 698.406313 seconds (11.640105 minutes)
id10831 had a duration of 384.043750 seconds (6.400729 minutes)
id10836 had a duration of 606.285000 seconds (10.104750 minutes)
id10838 had a duration of 539.885000 seconds (8.998083 minutes)
id10839 had a duration of 1682.250625 seconds (28.037510 minutes)
id10840 had a duration of 1310.448938 seconds (21.840816 minutes)
id10841 had a duration of 766.366000 seconds (12.772767 minutes)
id10846 had a duration of 1670.610750 seconds (27.843512 minutes)
id10847 had a duration of 742.966625 seconds (12.382777 minutes)
id10854 had a duration of 1358.809375 seconds (22.646823 minutes)
id10856 had a duration o

id11220 had a duration of 1258.649937 seconds (20.977499 minutes)
id11222 had a duration of 611.364250 seconds (10.189404 minutes)
id11224 had a duration of 914.167937 seconds (15.236132 minutes)
id11227 had a duration of 499.124625 seconds (8.318744 minutes)
id11228 had a duration of 1033.728250 seconds (17.228804 minutes)
id11229 had a duration of 361.042813 seconds (6.017380 minutes)
id11232 had a duration of 1199.928687 seconds (19.998811 minutes)
id11233 had a duration of 1013.886812 seconds (16.898114 minutes)
id11234 had a duration of 1915.534000 seconds (31.925567 minutes)
id11239 had a duration of 1166.689000 seconds (19.444817 minutes)
id11241 had a duration of 664.846750 seconds (11.080779 minutes)
id11246 had a duration of 693.124938 seconds (11.552082 minutes)
id11248 had a duration of 1659.572750 seconds (27.659546 minutes)
id11250 had a duration of 775.245312 seconds (12.920755 minutes)
id11251 had a duration of 546.643812 seconds (9.110730 minutes)


In [8]:
# a character must have MORE than this many seconds of audio to be counted (30 minutes)
threshold_seconds = 30 * 60

# number of characters whose summed duration exceeds the threshold
count_over_30_min = sum(1 for duration in char_durations.values() if duration > threshold_seconds)

# longest summed duration (in seconds) over ALL characters, not only those
# above the threshold; default=0 keeps this working when the dict is empty
record = max(char_durations.values(), default=0)

print('%i/%i characters are over 30 minutes in duration' % (count_over_30_min, len(char_durations)))
print('the longest summed duration is %i minutes' % (record/60))


45/395 characters are over 30 minutes in duration
the longest summed duration is 4509


In [2]:
"""
sandbox to test combining .wav files

uses two specific .wav files (personal recordings; earlier versions used clips from vox dev A)

since we can use Python's sum() function to find the sum of a list, we should be able to
append all the .wav files to a list and then combine them
"""

# sandbox: join two specified audio files into one and check that the
# durations add up
# https://stackoverflow.com/questions/2890703/how-to-join-two-wav-files-using-python

# input clips and the destination of the merged result
# (a vox dev clip such as datafolder + '/id10004/6WxS8rpNjmk/00002.wav' can be
#  substituted for sound2, with pc_specific_path + '/testCombination.wav' as output)
sound1_filepath = 'F:/ZaknafeinII_Backup_02-02-22/daea/personalWaves/Recording0002.wav'
sound2_filepath = 'F:/ZaknafeinII_Backup_02-02-22/daea/personalWaves/Recording0003.wav'
sounds_comb_filepath = 'F:/ZaknafeinII_Backup_02-02-22/daea/personalWaves/ABrown_merged.wav'

# load both clips, in order
sound1, sound2 = (
    AudioSegment.from_wav(clip_path)
    for clip_path in (sound1_filepath, sound2_filepath)
)

# sum() over a list is used deliberately (rather than sound1 + sound2) because
# the same call is what will merge an arbitrary number of clips later on
sounds_comb = sum([sound1, sound2])
sounds_comb.export(sounds_comb_filepath, format='wav')

# read the durations back from disk: first input, second input, merged output
sound1_dur, sound2_dur, sounds_comb_dur = (
    librosa.get_duration(filename=wav_path)
    for wav_path in (sound1_filepath, sound2_filepath, sounds_comb_filepath)
)

print('sound1 duration is %f seconds\nsound2 duration is %f seconds\nthe combined file duration is %f seconds' %
      (sound1_dur, sound2_dur, sounds_comb_dur))

sound1 duration is 3635.140000 seconds
sound2 duration is 1976.280000 seconds
the combined file duration is 5611.420000 seconds


In [7]:
"""
for each character with a length over 30 minutes (1800 seconds),
concatenate the .wav files to form a single, large .wav file

place all the merged .wav files in a parallel directory with names
corresponding to the character id
"""

# make the directory where the results will be stored
# (makedirs with exist_ok=True so re-running this cell does not fail when the
#  directory is already there)
result_dir = 'mergedWaves'
os.makedirs(pc_specific_path + '/' + result_dir, exist_ok=True)

# iterate over each different character
for character_id in character_ids:

    # if the current character does not have audio files of duration >= 30 minutes, move on
    # (a character with no recorded duration is treated as 0 seconds instead of
    #  comparing None with a number)
    if char_durations.get(character_id, 0) < 1800:
        continue

    curr_char_dir = datafolder+'/'+character_id

    # this tracks all the .wav files that need to be merged (all the .wav files belonging to the character)
    curr_char_wav_list = []

    # iterate over each audio directory belonging to that character
    for (dirpath, dirnames, filenames) in os.walk(curr_char_dir):

        # os.walk visits entries in arbitrary order; sorting dirnames in place
        # (and the filenames below) makes the concatenation order reproducible
        dirnames.sort()

        # iterate over each .wav audio file in the audio directory
        for filename in sorted(filenames):

            # skip anything that is not a .wav file
            if not filename.lower().endswith('.wav'):
                continue

            filepath = dirpath+'/'+filename

            # add the .wav file to curr_char_wav_list so it can be combined later
            curr_char_wav_list.append(AudioSegment.from_wav(filepath))

    # sum() of an empty list is the int 0, which has no export(); nothing to merge
    if not curr_char_wav_list:
        continue

    # now that all the .wav files to be combined have been collected in curr_char_wav_list
    # we can merge them and save the result
    output_path = pc_specific_path + '/' + result_dir + '/' + character_id + '_merged.wav'
    sounds_comb = sum(curr_char_wav_list)
    sounds_comb.export(output_path, format='wav')
    