Modification of whisper-distil.ipynb to handle bulk transcription of an entire directory. Added a file-renaming step that replaces spaces with hyphens in every audio filename (the ffmpeg command is passed through the shell unquoted, so a filename would otherwise be cut off at the first space).

In [None]:
import torch
# Show the installed PyTorch version as the cell output.
torch.__version__

In [None]:
# CUDA is needed for vastly faster transcription
# (about 47 seconds for a full 30-minute podcast).
# The cell output is True when a CUDA device is usable, False otherwise.
torch.cuda.is_available()

In [None]:
# https://github.com/huggingface/distil-whisper
# conda activate py310
import torch
import subprocess
import os
import glob
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Probe CUDA once so the device, dtype and attention backend always agree.
use_cuda = torch.cuda.is_available()
device = "cuda:0" if use_cuda else "cpu"
torch_dtype = torch.float16 if use_cuda else torch.float32
print(device)

# biggest model for highest quality
model_id = "distil-whisper/distil-large-v2"
model_kwargs = {
    "torch_dtype": torch_dtype,
    "low_cpu_mem_usage": True,
    "use_safetensors": True,
}
if use_cuda:
    # FlashAttention-2 needs a CUDA GPU and half-precision weights, so only
    # request it on the GPU path; on CPU the library's default attention
    # implementation is used, which keeps the CPU fallback above working.
    model_kwargs["attn_implementation"] = "flash_attention_2"

model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, **model_kwargs)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)

In [None]:
# rename all audio files with spaces in their name
# poe.com assisted code
# Specify the directory where the files are located
#directory = '/var/home/fraser/machine_learning/whisper.cpp/samples/'
directory = '/var/home/fraser/Music/Voice_Memos/'

# Audio file patterns that are collected for renaming and transcription.
AUDIO_PATTERNS = ('*.m4a', '*.mp3', '*.ogg', '*.wav')


def list_audio_files(folder):
    """Return the paths of all audio files directly inside ``folder``.

    Files are matched against AUDIO_PATTERNS, grouped by pattern in that
    order and sorted by name within each group. A folder that does not
    exist or contains no matching files yields an empty list.
    """
    # Escape the folder so characters such as '[' in its name are matched
    # literally instead of being treated as glob wildcards.
    safe_folder = glob.escape(folder)
    found = []
    for pattern in AUDIO_PATTERNS:
        found.extend(sorted(glob.glob(os.path.join(safe_folder, pattern))))
    return found


def rename_files_without_spaces(paths):
    """Rename each file whose name contains spaces, using hyphens instead.

    Only the file name is changed; spaces in the containing directory are
    left alone so the new path stays inside the same directory. A file is
    not renamed when the hyphenated name already exists, to avoid silently
    overwriting another recording.

    Returns a list of the paths as they exist after renaming, in the same
    order as ``paths``.
    """
    current_paths = []
    for path in paths:
        folder, name = os.path.split(path)
        if ' ' not in name:
            current_paths.append(path)
            continue

        new_path = os.path.join(folder, name.replace(' ', '-'))
        if os.path.exists(new_path):
            print(f"Skipped renaming {path}: {new_path} already exists.")
            current_paths.append(path)
            continue

        os.rename(path, new_path)
        current_paths.append(new_path)
    return current_paths


# `files` holds the post-rename paths, so the transcription cell below
# iterates over names that actually exist on disk.
files = rename_files_without_spaces(list_audio_files(directory))

In [None]:
# iterate over all audio files and transcribe them:

# Build the speech-recognition pipeline once; none of its arguments depend
# on the file being processed, so rebuilding it per file is wasted work.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=15,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device
)

for audio_file in files:
    # WAV files written by an earlier run of this cell also match the
    # '*.wav' glob; skip them so they are not converted and transcribed again.
    if audio_file.endswith('-output.wav'):
        continue

    # convert audio file to 16-bit wav format required by whisper
    output_file = audio_file + '-output.wav'
    print(audio_file)
    print(output_file)

    # Convert audio_file, then transcribe it to text.
    # The arguments are passed as a list without a shell, so file names with
    # spaces or shell metacharacters are handled literally. '-y' overwrites
    # an existing output file and '-nostdin' stops ffmpeg from reading the
    # notebook kernel's stdin. A missing ffmpeg binary raises
    # FileNotFoundError, which is left to propagate.
    try:
        subprocess.run(
            ['ffmpeg', '-y', '-nostdin', '-i', audio_file,
             '-ar', '16000', '-ac', '1', '-c:a', 'pcm_s16le', output_file],
            check=True)
        print("Audio coverted successfully.")
    except subprocess.CalledProcessError as e:
        print(f"Audio convertion failed with error {e.returncode}.")
        # There is no usable WAV to transcribe; move on to the next file.
        continue

    result_local = pipe(output_file)

    # save transcript as a .md file
    # (append mode is kept from the original: re-running adds to an existing
    # transcript rather than replacing it)
    transcript_path = output_file + ".md"
    with open(transcript_path, "a", encoding="utf-8") as f:
        f.write(result_local["text"])
    print("saved:", transcript_path)
        

The same file that took plain Whisper 4 min 44 s takes only 1.9 s under distil-whisper with CUDA!
A full podcast of over 30 minutes takes 43 s!