Modification of `whisper to text.ipynb` to handle bulk transcription of an entire directory. This uses the CPU-based whisper.cpp model, which is slower by 14-60x but gives the highest-quality output: punctuation and acronyms are most often correct, so very little, if any, editing is required.

Added a file-renaming step that replaces spaces with hyphens in every audio filename (otherwise the ffmpeg command is cut off at the first space).

CAUTION: MAKE SURE TO BACK UP AUDIO FILES THAT HAVE SPACES IN THEIR NAMES, AS THEY WILL BE RENAMED (AND THEIR METADATA ALTERED). Files whose names contain no spaces are not renamed; for every file, a converted copy is made in a compatible audio format.

In [None]:
# https://github.com/ggerganov/whisper.cpp
# conda activate py310
import subprocess
import os
import glob
import shutil

In [None]:
# Set podcasts=True to copy every file found under the Gnome Podcasts download folder
# (including its subdirectories) into the whisper.cpp/samples directory before transcribing.
# Set it to False to skip the copy and transcribe only what is already in the directory.
# The example download location is from the Gnome Podcasts app: https://apps.gnome.org/en-GB/Podcasts/
podcasts = True

In [None]:
# Collect the audio files to transcribe and remove spaces from their file names.
# poe.com assisted code
# `directory` is the folder that is scanned for audio. Only the file names are renamed;
# the directory path itself is never changed, so keep it free of whitespace
# (e.g. use '/bad_directory/' or '/bad-directory/' rather than '/bad directory/').
home_directory = os.path.expanduser("~")
directory = home_directory + '/machine_learning/whisper.cpp/samples/'
#directory = home_directory + '/Music/Voice_Memos/'

if podcasts:
    # Podcast downloads live in per-show subdirectories (often with spaces in their names);
    # flatten them into `directory`. Files with the same name overwrite each other.
    source_directory = home_directory + '/.var/app/org.gnome.Podcasts/data/gnome-podcasts/Downloads/'
    for root, _subdirectories, names in os.walk(source_directory):
        for name in names:
            # copy2 also copies the file's metadata where possible
            shutil.copy2(os.path.join(root, name), os.path.join(directory, name))

# Audio types to pick up (*.m2a and *.ogg were added for podcasts).
audio_patterns = ('*.m4a', '*.mp3', '*.m2a', '*.ogg', '*.wav')

# Converted copies are written next to the source as '<audio file>-output.wav', which
# also matches '*.wav'; skip them so a re-run does not convert and transcribe them again.
files = [
    path
    for pattern in audio_patterns
    for path in glob.glob(os.path.join(directory, pattern))
    if not path.endswith('-output.wav')
]

# Replace spaces with hyphens in the file names, e.g. 'Track 11.wav' -> 'Track-11.wav'.
# CAUTION: the original file is renamed in place and may replace an existing file that
# already has the hyphenated name (the original note also warns that the file's metadata
# is altered), so back up files with spaces in their names first.
renamed_files = []
for file in files:
    folder, name = os.path.split(file)
    if ' ' in name:
        # Only touch the file name: rewriting the whole path would point the rename at a
        # different (nonexistent) directory if a parent folder contained a space.
        new_name = os.path.join(folder, name.replace(' ', '-'))
        os.rename(file, new_name)
        file = new_name
    renamed_files.append(file)

# Keep `files` in sync with what is on disk now (later cells iterate over it), and drop
# duplicates that arise when both 'a b.wav' and 'a-b.wav' existed before the rename.
files = list(dict.fromkeys(renamed_files))

In [None]:
# Convert every collected audio file to the WAV format whisper needs, then transcribe it.
# Each entry of `files` is a full path (directory included), so the converted copy and
# the transcript end up next to the original audio file.
model_path = home_directory + '/machine_learning/whisper.cpp/models/ggml-model-whisper-large-q5_0.bin'

for file in files:
    audio_file = file
    # 16 kHz, mono, 16-bit PCM wav copy of the audio, as required by whisper
    output_file = audio_file + '-output.wav'
    print(audio_file)
    print(output_file)

    # Convert audio_file. The command is passed as an argument list (no shell), so paths
    # containing spaces or shell metacharacters reach ffmpeg unchanged. '-y' overwrites an
    # existing output file without prompting; stdin is detached so ffmpeg cannot block on it.
    try:
        subprocess.run(['ffmpeg', '-y', '-i', audio_file,
                        '-ar', '16000', '-ac', '1', '-c:a', 'pcm_s16le', output_file],
                       stdin=subprocess.DEVNULL, check=True)
        print("Audio coverted successfully.")
    except subprocess.CalledProcessError as e:
        print(f"Audio convertion failed with error {e.returncode}.")
        # Without a fresh wav there is nothing valid to transcribe for this file.
        continue
    except FileNotFoundError:
        print("Audio convertion failed: 'ffmpeg' was not found on PATH.")
        continue

    # Transcribe using the large quantized CPU model with 24 threads; '-otxt' asks for a
    # text-file transcript.
    # NOTE(review): `transcribe` is presumably the whisper.cpp executable (or a wrapper)
    # available on PATH - confirm on the target machine.
    try:
        subprocess.run(['transcribe', '-t', '24', '-m', model_path,
                        '-f', output_file, '-otxt'], check=True)
        print("Transcription executed successfully and saved in " + output_file)
    except subprocess.CalledProcessError as e:
        print(f"Transcription failed with error {e.returncode}.")
    except FileNotFoundError:
        print("Transcription failed: 'transcribe' was not found on PATH.")

In [None]:
# helper utility to combine all transcripts into one file (for ease of scanning podcasts)
# ai assisted
def combine_markdown_files(output_file_path: str, input_directory: str) -> None:
    """Concatenate every ``.txt`` file in *input_directory* into one file.

    Each transcript is written under a ``## <file name>`` heading and followed
    by a blank line. Files are combined in sorted path order so repeated runs
    produce the same result.

    Args:
        output_file_path: Path of the combined file; it is created or overwritten.
        input_directory: Directory holding the ``.txt`` files. A trailing path
            separator is optional.
    """
    # If the combined file lives inside input_directory, a previous run's output would
    # match '*.txt' and be read back while it is being rewritten - leave it out.
    output_abs = os.path.abspath(output_file_path)
    markdown_files = sorted(
        path
        for path in glob.glob(os.path.join(input_directory, '*.txt'))
        if os.path.abspath(path) != output_abs
    )

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for file_path in markdown_files:
            file_name = os.path.basename(file_path)
            with open(file_path, 'r', encoding='utf-8') as input_file:
                output_file.write(f"## {file_name}\n\n")
                output_file.write(input_file.read())
                output_file.write('\n\n')  # Add newline between files

    print(f"Combined {len(markdown_files)} text files into {output_file_path}")

# Write the combined transcript to 'podcast_transcripts.txt'. The path is relative, so the
# file lands in the notebook's working directory rather than in the input directory.
# CPU whisper saves to .txt format while distil-whisper saves to .md
combine_markdown_files(
    output_file_path='podcast_transcripts.txt',
    input_directory=directory,
)