<a href="https://colab.research.google.com/github/darinkist/medium-article-vosk/blob/main/Transcribe_large_audio_files_offline_with_Vosk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies
!pip install wget pydub wave tqdm vosk
!apt-get ffmpeg

# Download Vosk model
!mkdir models
!wget -P models/ https://alphacephei.com/vosk/models/vosk-model-en-us-0.22.zip
!unzip models/vosk-model-en-us-0.22.zip -d models/ && rm models/vosk-model-en-us-0.22.zip

# 1. Convert MP3 to WAV format

In [None]:
from pydub import AudioSegment
import os

def mp3_to_wav(source, skip=0, excerpt=False, excerpt_seconds=30):
    """Convert an MP3 file to a mono, 16 kHz WAV file written next to the source.

    Args:
        source: Path to the MP3 file.
        skip: Number of seconds to skip at the beginning of the audio.
        excerpt: If True, export only ``excerpt_seconds`` seconds of audio
            starting at ``skip`` to ``<source without extension>_excerpt.wav``.
            If False, export everything from ``skip`` onwards to
            ``<source without extension>.wav``.
        excerpt_seconds: Length of the excerpt in seconds. Only used when
            ``excerpt`` is True. Defaults to 30.

    Returns:
        The path of the WAV file that was written.
    """
    sound = AudioSegment.from_mp3(source)  # load source
    sound = sound.set_channels(1).set_frame_rate(16000)  # mono, 16000 Hz

    # Slice positions are expressed in milliseconds.
    start_ms = skip * 1000
    stem = os.path.splitext(source)[0]

    if excerpt:
        clip = sound[start_ms:start_ms + excerpt_seconds * 1000]
        output_path = stem + "_excerpt.wav"
    else:
        clip = sound[start_ms:]
        output_path = stem + ".wav"

    clip.export(output_path, format="wav")

    return output_path

In [None]:
# Convert an excerpt that starts 37 seconds into the episode.
# Replace the path below with the path to your own mp3 file.
wave_file = mp3_to_wav('/content/opto_sessions_ep_69.mp3', skip=37, excerpt=True)

# 2. Transcribe audio

In [None]:
from vosk import Model, KaldiRecognizer, SetLogLevel
from tqdm.notebook import tqdm
import wave
import os
import json

def transcript_file(input_file, model_path):
    """Transcribe a mono, 16-bit, uncompressed PCM WAV file with a Vosk model.

    The audio is fed to the recognizer in chunks of 4000 frames while a
    progress bar tracks the number of audio bytes processed.

    Args:
        input_file: Path to the WAV file to transcribe.
        model_path: Path to the Vosk model to load.

    Returns:
        The recognized text segments joined by single spaces. Segments for
        which the recognizer returned no text are left out.

    Raises:
        FileNotFoundError: If ``input_file`` is not a file or ``model_path``
            does not exist.
        TypeError: If the audio is not mono, 16-bit, uncompressed PCM.
    """
    # Check if file exists
    if not os.path.isfile(input_file):
        raise FileNotFoundError(os.path.basename(input_file) + " not found")

    # Check if model path exists
    if not os.path.exists(model_path):
        raise FileNotFoundError(os.path.basename(model_path) + " not found")

    # To store our results
    transcription = []

    # The context manager closes the file even when validation or
    # recognition raises.
    with wave.open(input_file, "rb") as wf:
        # check if wave file has the right properties
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            raise TypeError("Audio file must be WAV format mono PCM.")

        # Initialize model
        model = Model(model_path)
        rec = KaldiRecognizer(model, wf.getframerate())

        # Size the progress bar by the audio payload rather than the file
        # size on disk: the file also contains the WAV header, which is never
        # returned by readframes(), so the bar would never reach 100%.
        audio_bytes = wf.getnframes() * wf.getsampwidth() * wf.getnchannels()

        # Run transcription; the bar is closed when the block is left.
        with tqdm(total=audio_bytes) as pbar:
            while True:
                data = wf.readframes(4000)  # use buffer of 4000 frames
                if len(data) == 0:
                    pbar.set_description("Transcription finished")
                    break
                pbar.update(len(data))
                if rec.AcceptWaveform(data):
                    # Convert json output to dict and keep its text value
                    result_dict = json.loads(rec.Result())
                    transcription.append(result_dict.get("text", ""))

        # Get final bits of audio and flush the pipeline
        final_result = json.loads(rec.FinalResult())
        transcription.append(final_result.get("text", ""))

    # Skip empty segments so the text has no doubled or trailing spaces.
    transcription_text = ' '.join(segment for segment in transcription if segment)

    return transcription_text

In [None]:
# Transcribe the converted WAV file with the downloaded Vosk model.
transcription = transcript_file(
    input_file=wave_file,
    model_path='/content/models/vosk-model-en-us-0.22',
)

  0%|          | 0/960044 [00:00<?, ?it/s]

In [None]:
# Show the transcribed text as the cell output.
transcription

"to success on today's show i'm delighted to introduce beth kindle a technology analyst with over a decade of experience in the private markets she's now the co-founder of io fund which specializes in helping individuals gain a competitive advantage when investing in tech growth stocks how does beth do this well she's gained hands on experience over the years whilst either working for and or analyzing a huge amount of relevant tech companies in silicon valley they're involved in the market"