# In [2]:
import torch
import numpy as np
import sounddevice as sd
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Pre-trained checkpoint shared by the processor and the CTC model, so the
# two are always loaded from the same source.
_checkpoint = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(_checkpoint)
model = Wav2Vec2ForCTC.from_pretrained(_checkpoint)

# Parameters for audio capture
sampling_rate = 16000  # samples per second requested from the input stream
chunk_duration = 1  # duration of each audio chunk in seconds
# int() keeps the sample count usable as an array length even if
# chunk_duration is later changed to a fractional number of seconds
# (a float length would make np.zeros raise).
buffer_size = int(sampling_rate * chunk_duration)

# Buffer holding the most recent `buffer_size` samples, initially silence.
audio_buffer = np.zeros(buffer_size, dtype=np.float32)

def audio_callback(indata, frames, time, status):
    """Append the newest captured samples to the global sliding window.

    Passed as ``callback`` to ``sd.InputStream``. The oldest samples are
    dropped from the front of ``audio_buffer`` and the first channel of
    ``indata`` is appended at the end, so the buffer length never changes.

    The new window is assembled in a fresh array and only then bound to the
    global name; arrays already handed to a reader are never modified in
    place, so a reader always sees a complete window.

    Args:
        indata: 2-D array of captured samples, indexed ``[frame, channel]``.
        frames: Number of frames delivered (presumably equal to
            ``len(indata)``; the row count of ``indata`` is what is used).
        time: Timing information supplied by the stream (unused).
        status: Stream status flags; reported when truthy.
    """
    global audio_buffer

    if status:
        # Surface stream problems instead of silently dropping them.
        print(f"\nAudio stream status: {status}")

    samples = np.asarray(indata)[:, 0]
    count = samples.shape[0]
    if count == 0:
        # Nothing new: a `[-0:]` slice would select the whole buffer.
        return

    capacity = audio_buffer.shape[0]
    if count >= capacity:
        # More new audio than the window holds: keep only the newest part.
        # astype() copies, so the window never aliases the caller's array.
        audio_buffer = samples[-capacity:].astype(audio_buffer.dtype)
        return

    window = np.concatenate((audio_buffer[count:], samples))
    audio_buffer = window.astype(audio_buffer.dtype, copy=False)

def transcribe_audio(buffer):
    """Return the text the model predicts for the samples in ``buffer``.

    The samples are converted to model inputs by ``processor``, run through
    ``model`` with gradient tracking disabled, and the highest-scoring id
    along the last logits axis is decoded back to a string.
    """
    features = processor(buffer, return_tensors="pt", sampling_rate=sampling_rate)
    with torch.no_grad():
        output = model(features.input_values)
    best_ids = output.logits.argmax(dim=-1)
    # Only one sequence is submitted, so decode the first (only) row.
    return processor.decode(best_ids[0])

def main():
    """Stream microphone audio and print a live transcription until Ctrl+C.

    Opens a mono ``sd.InputStream`` at ``sampling_rate`` with
    ``audio_callback`` as its callback, then repeatedly transcribes the
    current contents of ``audio_buffer`` and rewrites a single output line
    in place. ``KeyboardInterrupt`` ends the loop and closes the stream.
    """
    with sd.InputStream(samplerate=sampling_rate, channels=1, callback=audio_callback):
        print("Recording... Press Ctrl+C to stop.")
        try:
            while True:
                # Transcribe a private copy so the stream callback cannot
                # alter the samples while the model is reading them.
                transcription = transcribe_audio(audio_buffer.copy())
                # No newline is written, so flush explicitly; otherwise a
                # line-buffered terminal never shows the text.
                print(f"Transcription: {transcription}", end="\r", flush=True)
        except KeyboardInterrupt:
            # Step off the "\r"-rewritten line so the final message does not
            # overwrite part of the last transcription.
            print()
            print("Stopped recording.")

if __name__ == "__main__":
    main()
