Realtime code


In [1]:
import pyaudio
import numpy as np
import torch
from transformers import AutoProcessor, AutoModelForAudioClassification

# Load the fine-tuned audio classification model and its processor
model_path = "/Users/7one/Documents/Work/mangoesai/livekit_paddle/heyzzx2"
processor = AutoProcessor.from_pretrained(model_path)
model = AutoModelForAudioClassification.from_pretrained(model_path)
# Switch to evaluation mode: the model is only used for inference below
model.eval()

# Audio parameters
sample_rate = 16000  # sampling rate (Hz) required by the model
chunk_size = 16000   # samples captured per read: 1 second of audio at 16 kHz
silence_threshold = 0.01  # silence-detection threshold; is_silent() compares the peak absolute amplitude against it

# Initialize PyAudio and open a mono, 16-bit integer input stream whose
# buffer holds exactly one chunk (chunk_size frames)
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16,
                channels=1,
                rate=sample_rate,
                input=True,
                frames_per_buffer=chunk_size)

def classify_audio(audio_array):
    """Classify an audio clip with the fine-tuned classification model.

    Args:
        audio_array (np.ndarray): Audio samples (mono, 16 kHz).

    Returns:
        str: The label that ``model.config.id2label`` maps the
            highest-scoring class index to.
    """
    # Turn the raw waveform into model-ready PyTorch tensors.
    features = processor(
        audio_array,
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
    )

    # Gradients are not needed for inference.
    with torch.no_grad():
        scores = model(**features).logits

    # Index of the largest logit along the class dimension.
    best_class = scores.argmax(dim=1).item()
    return model.config.id2label[best_class]

def is_silent(audio_array, threshold=None):
    """Decide whether an audio clip is silent.

    A clip counts as silent when its peak absolute amplitude is strictly
    below the threshold. An empty clip contains no sound and is therefore
    reported as silent instead of raising from ``np.max``.

    Args:
        audio_array (np.ndarray): Audio samples.
        threshold (float | None): Amplitude threshold. When ``None`` the
            module-level ``silence_threshold`` is used.

    Returns:
        bool: ``True`` if the clip is silent, ``False`` otherwise.
    """
    samples = np.asarray(audio_array)

    # np.max raises ValueError on a zero-size array; nothing captured
    # means nothing to classify.
    if samples.size == 0:
        return True

    if threshold is None:
        threshold = silence_threshold

    # Convert from np.bool_ so callers get the plain bool the docs promise.
    return bool(np.max(np.abs(samples)) < threshold)

print("Listening for wake word 'Hey ZZX'...")

try:
    wake_word_heard = False
    while not wake_word_heard:
        # Read one chunk from the microphone; input overflows are ignored
        # rather than raised.
        raw_audio = stream.read(chunk_size, exception_on_overflow=False)

        # Reinterpret the bytes as int16 PCM and scale to floats in [-1.0, 1.0).
        pcm_samples = np.frombuffer(raw_audio, dtype=np.int16)
        audio_array = pcm_samples.astype(np.float32) / 32768.0

        if is_silent(audio_array):
            # Silent chunks are skipped without running the model.
            print("Silent audio detected. Skipping.")
        else:
            # Run the classifier on the chunk and report its label.
            predicted_label = classify_audio(audio_array)
            print(f"Detected label: {predicted_label}")

            # Case-insensitive comparison against the wake word label.
            wake_word_heard = predicted_label.lower() == "hey zzx"

    print("Wake word 'Hey ZZX' detected!")
    print("Welcome, ZZX, what can I do for you?")

except KeyboardInterrupt:
    print("Stopping...")

finally:
    # Always stop and close the audio stream and release PyAudio.
    stream.stop_stream()
    stream.close()
    p.terminate()


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Listening for wake word 'Hey ZZX'...
Silent audio detected. Skipping.
Silent audio detected. Skipping.
Detected label: Hey ZZX
Wake word 'Hey ZZX' detected!
Welcome, ZZX, what can I do for you?
