In [None]:
# N-Best segmentation script - Dutch Broadcast News (BN-NL) subset
from pydub import AudioSegment

import os

# Paths to the audio files directory, the segmented audio files output directory,
# and the segmented STM files output directory
# !!! MAKE SURE TO CHANGE THESE IF YOU INTEND TO RUN THE CODE !!!
audio_path = '/home/jovyan/evaluation/nbest/bn_nl/'
out_path = '/home/jovyan/evaluation/nbest/bn_nl_segments/'
stm_path = '/home/jovyan/evaluation/nbest/bn_nl_seg_stm/'

# Seconds of context kept before and after every segment so that speech at the
# segment edges is not cut off
PADDING_S = 0.5
# A pause longer than this many seconds between two consecutive utterances
# starts a new segment
MAX_GAP_S = 5


def padded_bounds_ms(start_s, end_s, audio_len_ms):
    """Return the slice bounds, in milliseconds, of a padded segment.

    The window [start_s, end_s] (seconds) is widened by PADDING_S on both
    sides and then clamped to [0, audio_len_ms], so the slice never reaches
    outside the recording but still keeps as much padding as is available.
    """
    begin_ms = max(0.0, start_s * 1000 - PADDING_S * 1000)
    stop_ms = min(float(audio_len_ms), end_s * 1000 + PADDING_S * 1000)
    return begin_ms, stop_ms


def export_segment(audio, start_s, end_s, wav_file):
    """Cut the padded window [start_s, end_s] out of `audio` and save it as WAV."""
    begin_ms, stop_ms = padded_bounds_ms(start_s, end_s, len(audio))
    audio[begin_ms:stop_ms].export(wav_file, format='wav')


# Read all utterances of the bn-nl part of N-Best
# !!! MAKE SURE TO CHANGE THIS TO THE PATH OF THE STM FILE !!!
with open('/home/jovyan/evaluation/nbest/nbest-eval-2008-bn-nl.stm') as file:
    lines = file.readlines()

# Create the output directories only after the STM has been read successfully
os.makedirs(out_path, exist_ok=True)
os.makedirs(stm_path, exist_ok=True)

# ID of the audio file currently being processed ('' before the first utterance)
audiofile = ''
# Start of the current segment and end of its latest utterance, in seconds of
# the original recording
start_time = 0.0
end_time = 0.0
# Decoded audio of the current file
audio = None
# Index of the current segment within the current audio file
seg_num = 0
# Base name (without extension) shared by the segment's WAV and STM files
segment_id = ''
# Open handle of the STM file of the current segment
f = None

try:
    for line in lines:
        stripped = line.strip()
        # Skip blank lines and STM comment lines, which carry no utterance
        if not stripped or stripped.startswith(';;'):
            continue

        # STM columns: file ID, channel, speaker, start, end, then the rest
        # (label and transcript). Splitting on any whitespace tolerates tabs
        # and repeated spaces; the rest of the line is kept verbatim.
        fields = line.split(None, 5)
        file_id, channel, speaker = fields[:3]
        utt_start = float(fields[3])
        utt_end = float(fields[4])
        transcript = fields[5].rstrip('\r\n') if len(fields) > 5 else ''

        new_file = file_id != audiofile
        # A new segment starts with every new audio file and after every
        # pause longer than MAX_GAP_S, as long pauses impact the ASR model
        if new_file or utt_start - end_time > MAX_GAP_S:
            # Finish the previous segment, if there is one
            if f is not None:
                export_segment(audio, start_time, end_time,
                               os.path.join(out_path, segment_id + '.wav'))
                f.close()
            if new_file:
                audiofile = file_id
                audio = AudioSegment.from_file(os.path.join(audio_path, audiofile + '.wav'))
                seg_num = 0
            else:
                seg_num += 1
            start_time = utt_start
            segment_id = audiofile + '_' + str(seg_num)
            f = open(os.path.join(stm_path, segment_id + '.stm'), 'w')

        end_time = utt_end
        # Seconds of audio that really precede start_time in the exported WAV:
        # PADDING_S normally, less when the segment starts near the beginning
        # of the recording. Using it keeps the STM times aligned with the WAV.
        lead_in = min(PADDING_S, start_time)
        f.write(' '.join([segment_id, channel, speaker,
                          str(utt_start - start_time + lead_in),
                          str(utt_end - start_time + lead_in),
                          transcript]) + '\n')

    # The loop only exports a segment when the next one begins, so the last
    # segment of the last audio file still has to be saved here
    if f is not None:
        export_segment(audio, start_time, end_time,
                       os.path.join(out_path, segment_id + '.wav'))
finally:
    # Make sure the STM file of the current segment is closed even on errors
    if f is not None:
        f.close()

In [None]:
# N-Best segmentation script - Dutch Conversational Telephone Speech (CTS-NL) subset
from pydub import AudioSegment

import os

# Paths to the audio files directory, the segmented audio files output directory,
# and the segmented STM files output directory
# !!! MAKE SURE TO CHANGE THESE IF YOU INTEND TO RUN THE CODE !!!
audio_path = '/home/jovyan/evaluation/nbest/cts_nl/'
out_path = '/home/jovyan/evaluation/nbest/cts_nl_segments/'
stm_path = '/home/jovyan/evaluation/nbest/cts_nl_seg_stm/'

# Seconds of context kept before and after every segment so that speech at the
# segment edges is not cut off
PADDING_S = 0.5
# A pause longer than this many seconds between two consecutive utterances
# starts a new segment
MAX_GAP_S = 5


def padded_bounds_ms(start_s, end_s, audio_len_ms):
    """Return the slice bounds, in milliseconds, of a padded segment.

    The window [start_s, end_s] (seconds) is widened by PADDING_S on both
    sides and then clamped to [0, audio_len_ms], so the slice never reaches
    outside the recording but still keeps as much padding as is available.
    """
    begin_ms = max(0.0, start_s * 1000 - PADDING_S * 1000)
    stop_ms = min(float(audio_len_ms), end_s * 1000 + PADDING_S * 1000)
    return begin_ms, stop_ms


def export_segment(audio, start_s, end_s, wav_file):
    """Cut the padded window [start_s, end_s] out of `audio` and save it as WAV."""
    begin_ms, stop_ms = padded_bounds_ms(start_s, end_s, len(audio))
    audio[begin_ms:stop_ms].export(wav_file, format='wav')


# Read all utterances of the cts-nl part of N-Best
# !!! MAKE SURE TO CHANGE THIS TO THE PATH OF THE STM FILE !!!
with open('/home/jovyan/evaluation/nbest/nbest-eval-2008-cts-nl.stm') as file:
    lines = file.readlines()

# Create the output directories only after the STM has been read successfully
os.makedirs(out_path, exist_ok=True)
os.makedirs(stm_path, exist_ok=True)

# ID of the audio file currently being processed ('' before the first utterance)
audiofile = ''
# Channel currently being processed (2-channel audio, 1 speaker per channel);
# the STM numbers channels from 1, so '0' never matches a real channel
current_ch = '0'
# Start of the current segment and end of its latest utterance, in seconds of
# the original recording
start_time = 0.0
end_time = 0.0
# Mono tracks of the current audio file, one per channel
channels = None
# Index of the current segment within the current file and channel
seg_num = 0
# Base name shared by the segment's WAV and STM files:
# audiofilename-channel_segment
segment_id = ''
# Open handle of the STM file of the current segment
f = None

try:
    for line in lines:
        stripped = line.strip()
        # Skip blank lines and STM comment lines, which carry no utterance
        if not stripped or stripped.startswith(';;'):
            continue

        # STM columns: file ID, channel, speaker, start, end, then the rest
        # (label and transcript). Splitting on any whitespace tolerates tabs
        # and repeated spaces; the rest of the line is kept verbatim.
        fields = line.split(None, 5)
        file_id, channel, speaker = fields[:3]
        utt_start = float(fields[3])
        utt_end = float(fields[4])
        transcript = fields[5].rstrip('\r\n') if len(fields) > 5 else ''

        new_file = file_id != audiofile
        new_track = new_file or channel != current_ch
        # A new segment starts with every new file or channel and after every
        # pause longer than MAX_GAP_S
        if new_track or utt_start - end_time > MAX_GAP_S:
            # Finish the previous segment, if there is one. The channel index
            # is 0-based while the STM channel numbers start from 1.
            if f is not None:
                export_segment(channels[int(current_ch) - 1], start_time, end_time,
                               os.path.join(out_path, segment_id + '.wav'))
                f.close()
            if new_file:
                # Only decode the audio again when the file itself changes
                audio = AudioSegment.from_file(os.path.join(audio_path, file_id + '.wav'))
                channels = audio.split_to_mono()
            if new_track:
                audiofile = file_id
                current_ch = channel
                seg_num = 0
            else:
                seg_num += 1
            start_time = utt_start
            segment_id = audiofile + '-' + current_ch + '_' + str(seg_num)
            f = open(os.path.join(stm_path, segment_id + '.stm'), 'w')

        end_time = utt_end
        # Seconds of audio that really precede start_time in the exported WAV:
        # PADDING_S normally, less when the segment starts near the beginning
        # of the recording. Using it keeps the STM times aligned with the WAV.
        lead_in = min(PADDING_S, start_time)
        # The new STM no longer has 2 channels: every exported segment is mono,
        # so the channel column is always 1. Processing the channels separately
        # is easier (and Whisper cannot accept stereo audio). Merging both
        # channels into one is left for the future: the STM lists all segments
        # of one channel before those of the other, and some of them overlap,
        # so they would have to be merged and aligned first.
        f.write(' '.join([segment_id, '1', speaker,
                          str(utt_start - start_time + lead_in),
                          str(utt_end - start_time + lead_in),
                          transcript]) + '\n')

    # The loop only exports a segment when the next one begins, so the last
    # segment of the last file/channel still has to be saved here
    if f is not None:
        export_segment(channels[int(current_ch) - 1], start_time, end_time,
                       os.path.join(out_path, segment_id + '.wav'))
finally:
    # Make sure the STM file of the current segment is closed even on errors
    if f is not None:
        f.close()