In [None]:
%pip install -U openai-whisper sounddevice pyaudio numpy

In [None]:
import asyncio
import nest_asyncio
import whisper
import numpy as np
from sounddevice import RawInputStream

# The notebook kernel already runs an event loop; nest_asyncio patches asyncio
# so that the asyncio.run(...) call in the last cell can be nested inside it.
nest_asyncio.apply()

In [None]:
class MyWhisperHandler:
    """Hold a loaded Whisper model plus decoding options and print results."""

    def __init__(self, model="base"):
        """Load the requested Whisper model and build the decoding options.

        Args:
            model: Model name handed to ``whisper.load_model``; ``"base"`` by
                default.
        """
        loaded_model = whisper.load_model(model)
        # Half-precision decoding is switched off unconditionally.
        decoding_options = whisper.DecodingOptions(fp16=False)

        self.model = loaded_model
        self.options = decoding_options

    async def handle_transcript_event(self, transcript_event, lang):
        """Print one line made of ``lang`` and the event's ``text`` attribute.

        Args:
            transcript_event: Object exposing a ``text`` attribute.
            lang: Value printed in front of the text.
        """
        line = f"{lang} - {transcript_event.text}"
        print(line)

In [None]:
async def mic_stream(samplerate=16000, blocksize=1024 * 2):
    """Yield microphone audio blocks as normalized float32 arrays, forever.

    Opens a mono, 16-bit ``RawInputStream`` and forwards every block the stream
    hands to its callback onto an ``asyncio.Queue``. The generator then yields
    the queued blocks in arrival order and never terminates on its own.

    Args:
        samplerate: Capture rate in Hz passed to ``RawInputStream``. Defaults
            to 16000, the rate Whisper models work with.
        blocksize: Number of frames per callback block. Defaults to 2048.

    Yields:
        tuple: ``(samples, status)`` where ``samples`` is a 1-D ``float32``
        NumPy array scaled to the range [-1.0, 1.0) and ``status`` is the
        status object the stream passed to the callback.
    """
    # Inside a coroutine the running loop is the one the callback must target.
    loop = asyncio.get_running_loop()
    input_queue = asyncio.Queue()

    def callback(indata, frame_count, time_info, status):
        # Whisper expects float32 audio in [-1, 1]; raw int16 PCM therefore has
        # to be scaled by 1/32768, not merely cast. astype() also copies the
        # data out of the stream's buffer before the callback returns.
        samples = np.frombuffer(indata, dtype=np.int16).astype(np.float32) / 32768.0
        # sounddevice invokes the callback outside the event loop, so the item
        # is handed over with the thread-safe scheduling call.
        loop.call_soon_threadsafe(input_queue.put_nowait, (samples, status))

    stream = RawInputStream(
        channels=1,
        samplerate=samplerate,
        callback=callback,
        blocksize=blocksize,
        dtype="int16",
    )

    # The context manager keeps the stream open while blocks are consumed and
    # closes it when the generator is closed or an error propagates.
    with stream:
        while True:
            indata, status = await input_queue.get()
            yield indata, status

In [None]:
async def write_chunks(handler):
    """Transcribe microphone audio block by block and report each result.

    For every block produced by ``mic_stream()`` the audio is padded or trimmed
    to Whisper's fixed input length, converted to a log-Mel spectrogram on the
    model's device, language-detected and decoded. The decoding result and the
    detected language code are then passed to the handler.

    Args:
        handler: Object providing ``model`` (a loaded Whisper model),
            ``options`` (decoding options) and the coroutine method
            ``handle_transcript_event(transcript_event, lang)``.

    Note:
        The Whisper calls are synchronous and run on the event loop, so no
        other coroutine makes progress while a block is being decoded.
    """
    async for chunk, status in mic_stream():
        audio = whisper.pad_or_trim(chunk)
        mel = whisper.log_mel_spectrogram(audio).to(handler.model.device)
        _, probs = handler.model.detect_language(mel)
        # Pass the bare language code (e.g. "en"). Wrapping it in braces would
        # build a one-element set and print as "{'en'}".
        lang = max(probs, key=probs.get)
        transcript_event = whisper.decode(handler.model, mel, handler.options)
        await handler.handle_transcript_event(transcript_event, lang)

In [None]:
async def basic_transcribe():
    """Create a default ``MyWhisperHandler`` and run ``write_chunks`` on it."""
    whisper_handler = MyWhisperHandler()
    transcription = write_chunks(whisper_handler)
    await asyncio.gather(transcription)

In [None]:
# The transcription loop never ends on its own; interrupting the kernel is the
# intended way to stop it, so treat that as a clean stop, not a traceback.
try:
    asyncio.run(basic_transcribe())
except KeyboardInterrupt:
    print("Transcription stopped.")