# Transcribe audio using a locally-running Vosk model
3/24/2025, Dave Sisk, https://github.com/davidcsisk, https://www.linkedin.com/in/davesisk-doctordatabase/

This model is suitable for desktop or mobile device use.

In [5]:
# Do the necessary prep work...
#! pip install vosk
# Need to download and extract Vosk model into ./model directory
# https://alphacephei.com/vosk/models
# The model directory should be in the current working directory,
# and look something like this:
# model/
# ├── am/                # Acoustic model files
# ├── conf/              # Configuration files
# ├── graph/             # Graph files for decoding
# ├── ivector/           # Files for speaker identification (optional)
# ├── rescore/           # Rescoring files (optional)
# ├── README             # Information about the model
# └── other files        # Additional files required for the model

#! pip install wave
#! pip install pyaudio
#! pip install pydub

# Must download and install ffmpeg
# https://ffmpeg.org/download.html
# check it's accessible in the path:  ffmpeg -version


In [4]:
import os
import wave
import json
from vosk import Model, KaldiRecognizer

# Folder that is scanned for MP3 input files
audio_directory = "."

# Instantiate the Vosk speech model. The downloaded model must already be
# unpacked into a directory named "model" (path is relative to the current
# working directory).
model = Model("model")

def convert_mp3_to_wav(mp3_file, wav_file):
    """Decode an MP3 file and write it out as a mono, 16-bit, 16 kHz WAV file.

    Args:
        mp3_file: Path of the MP3 file to read.
        wav_file: Path of the WAV file to write.
    """
    # Imported here rather than at module level, so pydub is only needed
    # once a conversion is actually requested.
    from pydub import AudioSegment

    source_audio = AudioSegment.from_mp3(mp3_file)

    # Normalise step by step: one channel, 2 bytes per sample, 16000 frames/s
    mono_audio = source_audio.set_channels(1)
    sixteen_bit_audio = mono_audio.set_sample_width(2)
    resampled_audio = sixteen_bit_audio.set_frame_rate(16000)

    resampled_audio.export(wav_file, format="wav")

# Transcribe every MP3 file found in audio_directory.
# For each "<name>.mp3" this writes "<name>.wav" (the converted audio) and
# "<name>.txt" (the transcription) alongside the input file.
# sorted() gives a deterministic processing order; os.listdir() returns
# entries in arbitrary order.
for file_name in sorted(os.listdir(audio_directory)):
    # Compare the extension case-insensitively so e.g. "TRACK.MP3" is not skipped
    if not file_name.lower().endswith(".mp3"):
        continue
    print(f"Processing {file_name}...")

    # os.listdir() returns bare names, so anchor every path on audio_directory;
    # otherwise the files are only found when audio_directory happens to be the
    # current working directory. normpath() drops a leading "./", so the
    # default "." setting produces the same paths as plain file names.
    mp3_file = os.path.normpath(os.path.join(audio_directory, file_name))
    base_path = os.path.splitext(mp3_file)[0]
    wav_file = f"{base_path}.wav"
    output_file = f"{base_path}.txt"

    # Convert MP3 to WAV
    convert_mp3_to_wav(mp3_file, wav_file)

    # Open the WAV file
    with wave.open(wav_file, "rb") as wf:
        # Defensive check of the audio format before handing it to the recognizer
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getframerate() not in [8000, 16000]:
            print(f"Skipping {file_name}: WAV file must be mono, 16-bit, and 8kHz or 16kHz.")
            continue

        rec = KaldiRecognizer(model, wf.getframerate())
        # Collect recognised text pieces and join once at the end instead of
        # growing a string with repeated concatenation.
        segments = []

        # Feed the audio to the recognizer 4000 frames at a time
        while True:
            data = wf.readframes(4000)
            if not data:
                break
            # A truthy return means a result is ready to be fetched via Result()
            if rec.AcceptWaveform(data):
                result = json.loads(rec.Result())
                segments.append(result.get("text", ""))

        # Fetch whatever the recognizer still holds after the last chunk
        final_result = json.loads(rec.FinalResult())
        segments.append(final_result.get("text", ""))

    # Save the transcription to a text file
    transcription = " ".join(segments).strip()
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(transcription)
    print(f"Transcription saved to {output_file}")

Processing popularhistoryoftheartofmusic_00_mathews.mp3...
Transcription saved to popularhistoryoftheartofmusic_00_mathews.txt
Processing popularhistoryoftheartofmusic_01_mathews.mp3...
Transcription saved to popularhistoryoftheartofmusic_00_mathews.txt
Processing popularhistoryoftheartofmusic_01_mathews.mp3...
Transcription saved to popularhistoryoftheartofmusic_01_mathews.txt
Processing popularhistoryoftheartofmusic_02_mathews.mp3...
Transcription saved to popularhistoryoftheartofmusic_01_mathews.txt
Processing popularhistoryoftheartofmusic_02_mathews.mp3...
Transcription saved to popularhistoryoftheartofmusic_02_mathews.txt
Processing popularhistoryoftheartofmusic_03_mathews_reduced-sample-rate_16KHz.mp3...
Transcription saved to popularhistoryoftheartofmusic_02_mathews.txt
Processing popularhistoryoftheartofmusic_03_mathews_reduced-sample-rate_16KHz.mp3...
Transcription saved to popularhistoryoftheartofmusic_03_mathews_reduced-sample-rate_16KHz.txt
Transcription saved to popularhis