In [11]:
import math
import os
import tempfile

import moviepy.editor as mp
import speech_recognition as sr
from tqdm import tqdm

def extract_text_from_audio(file_path):
    # Load the video file
    clip = mp.VideoFileClip(file_path)
    
    # Extract the audio from the video
    audio = clip.audio
    
    # Save the audio as a temporary file
    audio_path = "temp_audio.wav"
    audio.write_audiofile(audio_path)
    
    # Initialize the recognizer
    r = sr.Recognizer()
    
    # Transcribe each chunk of audio
    chunk_duration = 30  # Chunk duration in seconds
    total_duration = clip.duration
    chunks = int(total_duration / chunk_duration) + 1
    
    text = ""
    
    # Use tqdm to create a loading bar
    with tqdm(total=chunks, desc="Processing", unit="chunk") as pbar:
        for i in range(chunks):
            start_time = i * chunk_duration
            end_time = min((i + 1) * chunk_duration, total_duration)
            
            with sr.AudioFile(audio_path) as source:
                audio = r.record(source, offset=start_time, duration=end_time - start_time)
                chunk_text = r.recognize_google(audio)
                text += chunk_text + " "
            
            pbar.update(1)  # Update the loading bar
            
    return text

# Path of the media file whose audio track will be transcribed.
file_path = "thoreau-walden.mp4"   #YOUR AUDIO FILE HERE

# Transcribe the audio track of the file.
extracted_text = extract_text_from_audio(file_path)

# Save the transcript. The encoding is given explicitly so that non-ASCII
# characters in the transcript are written the same way on every platform
# instead of depending on the locale's default encoding.
output_file = "extracted_text.txt"
with open(output_file, "w", encoding="utf-8") as file:
    file.write(extracted_text)

print("Text extracted and saved to:", output_file)

MoviePy - Writing audio in temp_audio.wav


                                                                                                                       

MoviePy - Done.


Processing: 100%|█████████████████████████████████████████████████████████████████████| 3/3 [00:17<00:00,  5.91s/chunk]

Text extracted and saved to: extracted_text.txt



