In [None]:
# Installations
!pip install SpeechRecognition pydub

# Import Libraries
import os
from pydub import AudioSegment
from pydub.silence import split_on_silence
import speech_recognition as sr

NOTE:

We are using SpeechRecognition, a Python library, together with the Google
Speech Recognition API to transcribe the audio.
Specifically, we use the `Recognizer` class to read the audio from a given source file and convert it to text.


Secondly, we are using Pydub to manipulate the audio file through a simple, easy-to-use, high-level interface. It is also an open-source Python library.

Together, the two libraries give us a convenient way to perform efficient speech-to-text recognition in this project.

In [None]:
# Define function to transcribe audio using Google Speech Recognition API
def transcribe_audio(path):
    """Return the text Google Speech Recognition hears in the audio file at `path`.

    The whole file is read into memory and sent to the recognizer in one
    request. Any exception raised by the recognizer is propagated to the
    caller.
    """
    recognizer = sr.Recognizer()

    # Load the complete recording from disk into an in-memory audio object
    with sr.AudioFile(path) as audio_file:
        recorded_audio = recognizer.record(audio_file)

    # The recording is already in memory, so the file can be closed before
    # the audio is handed to the recognition service
    return recognizer.recognize_google(recorded_audio)

# Function to split audio into chunks based on silence and transcribe them
def get_large_audio_transcription_on_silence(
    path,
    *,
    min_silence_len=500,
    silence_thresh_offset=14,
    keep_silence=500,
    folder_name="audio-chunks",
):
    """Split a large audio file on silence and transcribe each chunk.

    The audio is cut wherever it stays quiet for long enough, every chunk is
    exported as a WAV file into `folder_name`, and each exported file is
    passed to `transcribe_audio`. Progress is printed chunk by chunk.

    Args:
        path: Path of the audio file to transcribe.
        min_silence_len: Minimum length of a silence, in milliseconds, that
            triggers a split. Experiment with this value for your audio.
        silence_thresh_offset: How far below the recording's average loudness
            (`sound.dBFS`) the audio must drop to count as silence.
        keep_silence: Milliseconds of silence kept around each chunk so the
            speech is not cut off abruptly.
        folder_name: Directory that receives the exported `chunk<N>.wav`
            files. It is created if it does not exist; existing files with
            the same names are overwritten.

    Returns:
        The transcriptions of all recognised chunks joined into one string.
        Each chunk is capitalised and followed by ". ". Chunks that could
        not be recognised are skipped; the result is "" if there are none.

    Raises:
        Only `sr.UnknownValueError` is handled here; any other exception
        raised by `transcribe_audio` (for example a failed API request)
        propagates to the caller.
    """
    # Open the audio file using pydub
    sound = AudioSegment.from_file(path)

    # Split the audio wherever it is quieter than the threshold for at least
    # `min_silence_len` milliseconds
    chunks = split_on_silence(
        sound,
        min_silence_len=min_silence_len,
        silence_thresh=sound.dBFS - silence_thresh_offset,
        keep_silence=keep_silence,
    )

    # Create the directory for the audio chunks; a no-op if it already exists
    os.makedirs(folder_name, exist_ok=True)

    transcripts = []
    for i, audio_chunk in enumerate(chunks, start=1):
        # Export the chunk so the recognizer can read it back from disk
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")

        try:
            text = transcribe_audio(chunk_filename)
        except sr.UnknownValueError as e:
            # The exception may carry no message of its own (TODO confirm),
            # so name the chunk and fall back to a readable explanation.
            print("Error:", chunk_filename, ":", str(e) or "speech could not be understood")
            continue

        text = f"{text.capitalize()}. "
        print(chunk_filename, ":", text)
        transcripts.append(text)

    # Return the text for all chunks detected
    return "".join(transcripts)

In [None]:
# Example run: transcribe the sample recording chunk by chunk, then show the
# combined transcript underneath the per-chunk progress lines.
path = "./audio.wav"
print(
    "\nFull text:",
    get_large_audio_transcription_on_silence(path),
)


audio-chunks/chunk1.wav : Here's a bird which he had fixed in a bowery or a country seat. 
audio-chunks/chunk2.wav : Add a short distance from the city. 
audio-chunks/chunk3.wav : Just that what is now called dutch street. 
audio-chunks/chunk4.wav : Soon abounded with proofs of his ingenuity. 
audio-chunks/chunk5.wav : Patent smoke. 
audio-chunks/chunk6.wav : It required a horse to work some. 
audio-chunks/chunk7.wav : Dutch ovens that roasted meat without fire. 
audio-chunks/chunk8.wav : Carts that went before the horses. 
audio-chunks/chunk9.wav : Weather cox that turned against the wind and other wrong-headed contrivances. 
audio-chunks/chunk10.wav : Set astonished and confounded all beholders. 

Full text: Here's a bird which he had fixed in a bowery or a country seat. Add a short distance from the city. Just that what is now called dutch street. Soon abounded with proofs of his ingenuity. Patent smoke. It required a horse to work some. Dutch ovens that roasted meat without fire. C