# In [3]:
import io
import os
import time
import wave

import requests
import whisper
from google.cloud import speech
from jiwer import wer
from moviepy.editor import VideoFileClip
from tabulate import tabulate

# Prefer the ASSEMBLYAI_API_KEY environment variable so the real key never has
# to be written into this file; the placeholder is only a fallback and must be
# replaced (or the variable set) before calling the AssemblyAI helpers.
ASSEMBLYAI_API_KEY = os.environ.get('ASSEMBLYAI_API_KEY', 'your_assemblyai_api_key')

# Step 1: Extract audio from video
def extract_audio(video_path, audio_path='temp_audio.wav'):
    """Write the audio track of a video to a 16-bit PCM WAV file.

    Args:
        video_path: Path of the video, as accepted by ``VideoFileClip``.
        audio_path: Destination path for the extracted audio.

    Returns:
        ``audio_path``, unchanged, for convenient chaining.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError(f"No audio track found in {video_path!r}")
        video.audio.write_audiofile(audio_path, codec='pcm_s16le')
    finally:
        # Release the reader resources held by the clip even when the
        # export fails.
        video.close()
    return audio_path

# Step 2a: Whisper transcription
def transcribe_whisper(audio_path):
    """Transcribe an audio file with the Whisper "base" model.

    Args:
        audio_path: Path of the audio file passed to ``model.transcribe``.

    Returns:
        The ``'text'`` entry of the transcription result.
    """
    transcription = whisper.load_model("base").transcribe(audio_path)
    return transcription['text']

# Step 2b: Google Speech-to-Text
def transcribe_google(audio_path):
    """Transcribe an audio file with Google Cloud Speech-to-Text.

    The sample rate and channel count are read from the WAV header so the
    request matches the actual file instead of assuming 16 kHz mono.

    Args:
        audio_path: Path of a LINEAR16 (16-bit PCM) audio file.

    Returns:
        The top transcript of every result, joined with single spaces.
    """
    # Keep the previous hard-coded assumption as a fallback for files that
    # have no parseable WAV header (e.g. headerless raw PCM).
    sample_rate, channels = 16000, None
    try:
        with wave.open(audio_path, "rb") as wav_file:
            sample_rate = wav_file.getframerate()
            channels = wav_file.getnchannels()
    except (wave.Error, EOFError):
        pass

    client = speech.SpeechClient()
    with io.open(audio_path, "rb") as f:
        content = f.read()
    audio = speech.RecognitionAudio(content=content)

    config_kwargs = {
        "encoding": speech.RecognitionConfig.AudioEncoding.LINEAR16,
        "sample_rate_hertz": sample_rate,
        "language_code": "en-US",
    }
    if channels is not None:
        config_kwargs["audio_channel_count"] = channels
    config = speech.RecognitionConfig(**config_kwargs)

    # NOTE(review): synchronous `recognize` with inline content is meant for
    # short clips; longer audio presumably needs `long_running_recognize`
    # -- confirm against the service limits.
    response = client.recognize(config=config, audio=audio)
    return " ".join(
        result.alternatives[0].transcript
        for result in response.results
        if result.alternatives
    )

# Step 2c: AssemblyAI transcription
def upload_to_assemblyai(audio_path):
    """Upload a local audio file to AssemblyAI.

    Args:
        audio_path: Path of the audio file to upload.

    Returns:
        The ``upload_url`` reported by the API for the uploaded file.

    Raises:
        requests.HTTPError: If the API answers with an error status
            (for example an invalid API key).
    """
    headers = {'authorization': ASSEMBLYAI_API_KEY}
    with open(audio_path, 'rb') as f:
        # The upload endpoint takes the raw file bytes as the request body;
        # `files=` would wrap them in a multipart form instead.
        response = requests.post(
            'https://api.assemblyai.com/v2/upload',
            headers=headers,
            data=f,
            timeout=300,
        )
    response.raise_for_status()
    return response.json()['upload_url']

def transcribe_assemblyai(audio_url):
    """Request an AssemblyAI transcript and poll until it finishes.

    Args:
        audio_url: URL of the audio to transcribe (e.g. the value returned
            by ``upload_to_assemblyai``).

    Returns:
        The transcript text once the job status is ``'completed'``, or the
        literal string ``"Error"`` when the job status is ``'error'``.

    Raises:
        requests.HTTPError: If any request is answered with an error status.
    """
    headers = {'authorization': ASSEMBLYAI_API_KEY, 'content-type': 'application/json'}
    response = requests.post(
        'https://api.assemblyai.com/v2/transcript',
        headers=headers,
        json={"audio_url": audio_url},
        timeout=30,
    )
    # Surface auth/validation failures directly instead of a KeyError on 'id'.
    response.raise_for_status()
    transcript_id = response.json()['id']
    status_url = f'https://api.assemblyai.com/v2/transcript/{transcript_id}'

    while True:
        polling = requests.get(status_url, headers=headers, timeout=30)
        polling.raise_for_status()
        payload = polling.json()  # decode once per poll
        status = payload['status']
        if status == 'completed':
            return payload['text']
        if status == 'error':
            return "Error"
        time.sleep(2)

# Step 3: Compare all outputs
def compare_transcriptions(reference, *others):
    """Score candidate transcripts against a reference using word error rate.

    Args:
        reference: The transcript treated as ground truth.
        *others: ``(name, text)`` pairs, one per transcript to score.

    Returns:
        One ``[name, accuracy, wer]`` row per candidate, with both figures
        formatted as percentages with two decimals.
    """
    results = []
    for name, text in others:
        error = wer(reference, text)
        # WER exceeds 1.0 when the candidate has more errors than the
        # reference has words; floor accuracy at 0 so it never prints negative.
        accuracy = max(0.0, 1 - error)
        results.append([name, f"{accuracy * 100:.2f}%", f"{error * 100:.2f}%"])
    return results

# Step 4: Run all
def run_comparison(video_path):
    """Run the full pipeline for one video and print a comparison table.

    Extracts the audio, transcribes it with Whisper, Google STT and
    AssemblyAI, then scores the latter two against the Whisper transcript.

    Args:
        video_path: Video location handed to ``extract_audio``.
    """
    print("🎬 Extracting audio...")
    wav_file = extract_audio(video_path)

    print("🧠 Transcribing with Whisper...")
    reference_text = transcribe_whisper(wav_file)

    print("☁️ Transcribing with Google STT...")
    candidates = [("Google STT", transcribe_google(wav_file))]

    print("🔊 Transcribing with AssemblyAI...")
    uploaded_url = upload_to_assemblyai(wav_file)
    candidates.append(("AssemblyAI", transcribe_assemblyai(uploaded_url)))

    # Whisper's output stands in for ground truth here; swap in a manual
    # reference transcript for a true accuracy measurement.
    print("\n📊 Comparing outputs using Whisper as reference...\n")
    rows = compare_transcriptions(reference_text, *candidates)

    table = tabulate(rows, headers=["Model", "Accuracy", "WER"], tablefmt="fancy_grid")
    print(table)

# Example usage. Guarded so importing this module does not start the pipeline.
# NOTE(review): the argument is handed to VideoFileClip, which presumably
# needs a local file or a direct media URL rather than a YouTube watch page
# -- confirm and replace.
if __name__ == "__main__":
    run_comparison("https://www.youtube.com/watch?v=kHVAk96r05Y")  # Replace with your video path


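# Output: ModuleNotFoundError: No module named 'whisper'
# (the `whisper` module is provided by the `openai-whisper` package: pip install openai-whisper)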