In [None]:
import queue
import re
import pyaudio
import asyncio
import torch 
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import numpy as np
import threading


# Audio settings
STEP_IN_SEC: int = 1 # We'll increase the processable audio data by this
LENGTH_IN_SEC: int = 6 #  MAximum time duration at which the audio data will pe processed together at once. Think of it as sliding window
NB_CHANNELS = 1 # Input channels. For mic, it's usually 1
SAMPLE_RATE = 16000 # Per second Sampling Rate
CHUNK = SAMPLE_RATE

# Queues
audio_queue = queue.Queue()
length_queue = queue.Queue(maxsize=LENGTH_IN_SEC)

# Whisper settings
LANGUAGE = "english"
TRANSCRIPTION_MODEL_NAME = "openai/whisper-large-v3-turbo"

# Visualization
MAX_SENTENCE_CHARACTERS = 128

# Devices, dtypes
device_name = torch.device("cuda") if torch.cuda.is_available() else ("mps" if torch.mps.is_available() else "cpu")
device = torch.device(device_name)
torch_dtype = torch.bfloat16

# --------------------- Transcription Pipeline -------------------------------

transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    TRANSCRIPTION_MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True
)
transcription_model.to(device)

processor = AutoProcessor.from_pretrained(TRANSCRIPTION_MODEL_NAME)

transcription_pipeline = pipeline(
    "automatic-speech-recognition",
    model=transcription_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s = LENGTH_IN_SEC,
    batch_size = 1,  # batch size for inference - set based on your device
    torch_dtype=torch_dtype,
    device=device,
)

# Global flag to control transcription
is_transcribing = True

def producer():
    global is_transcribing
    audio = pyaudio.PyAudio()
    stream = audio.open(
        format=pyaudio.paInt16,
        channels=NB_CHANNELS,
        rate=SAMPLE_RATE,
        input=True,
        frames_per_buffer=CHUNK,
    )

    while is_transcribing:
        audio_data = b""
        for _ in range(STEP_IN_SEC):
            chunk = stream.read(SAMPLE_RATE)
            audio_data += chunk

        audio_queue.put(audio_data)

    stream.stop_stream()
    stream.close()
    audio.terminate()

def consumer():
    global is_transcribing
    while is_transcribing:
        if length_queue.qsize() >= LENGTH_IN_SEC:
            length_queue._queue.clear()

        audio_data = audio_queue.get()
        length_queue.put(audio_data)

        audio_data_to_process = b""
        for i in range(length_queue.qsize()):
            audio_data_to_process += length_queue._queue[i]

        audio_data_array = np.frombuffer(audio_data_to_process, np.int16).astype(np.float32) / 255.0

        transcription_out = transcription_pipeline({"array":audio_data_array, "sampling_rate":SAMPLE_RATE}, 
                                             return_timestamps=True, 
                                             generate_kwargs={"language": "english", "return_timestamps": True, "max_new_tokens": MAX_SENTENCE_CHARACTERS})

        # yield f"data: {transcription_out["text"]}\n\n"
        print(transcription_out["text"], end='\r', flush=True)
        audio_queue.task_done()


producer = threading.Thread(target=producer)
producer.start()

consumer = threading.Thread(target=consumer)
consumer.start()

try:
    producer.join()
    consumer.join()
except KeyboardInterrupt:
    print("Exiting...")

Exception in thread Thread-5 (consumer):
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/threading.py", line 1073, in _bootstrap_inner
    self.run()
  File "/opt/anaconda3/lib/python3.12/threading.py", line 1010, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/cw/t5b5dh315pb4f2w38_466zs80000gn/T/ipykernel_50748/1122364220.py", line 91, in consumer
AttributeError: 'Queue' object has no attribute '_queue'. Did you mean: 'queue'?


In [1]:
from datasets import load_dataset

dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

In [8]:
# import torch
# from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# from datasets import load_dataset


# device = torch.device("mps")
# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# model_id = "openai/whisper-large-v3-turbo"

# model = AutoModelForSpeechSeq2Seq.from_pretrained(
#     model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
# )
# model.to(device)

# processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
#    return_timestamps = True
)

# dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
# sample = dataset[0]["audio"]
# del sample["path"]

# result = pipe(sample)
# print(result["text"])




ValueError: You have passed more than 3000 mel input features (> 30 seconds) which automatically enables long-form generation which requires the model to predict timestamp tokens. Please either pass `return_timestamps=True` or make sure to pass no more than 3000 mel input features.

In [None]:
pipeline({"array":sample["array"], }, 
                                             return_timestamps=True, 
                                             generate_kwargs={"language": "english", "return_timestamps": True, 
                                                              "max_new_tokens": 128})