Training
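In this notebook we fine-tune the MIT AST model trained on Speech Commands V2 (`MIT/ast-finetuned-speech-commands-v2`) as a two-class classifier for a new wake-up word, "Hey ZZX". After training, the new model can be used to recognize this wake-up word.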
Procedure:
- Generate the new voice dataset (at least 30 samples in different voices); you can use text-to-speech tools to generate them:
  - https://www.narakeet.com/languages/chinese-text-to-speech
  - https://micmonster.com/text-to-speech/chinese-mandarin-simplified/
- Convert the audio to the required format (WAV file, 16 kHz) - use code
- Split it into a training dataset (80%) and a test dataset (20%); if you have more data, set some aside as a validation dataset - use code
- Generate two CSV files (train and test) that list the paths of the audio files
- Train the model and save it to the local PC
- Real-time test

Note: This notebook skips the voice generation, format conversion, and CSV split steps,
and starts directly from training.

In [None]:
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForAudioClassification, TrainingArguments, Trainer
import numpy as np
import torch

# Load the train and test splits from their CSV files.
data_files = dict(
    train="/Users/7one/Documents/Work/mangoesai/livekit_paddle/heyzzx2/train.csv",
    test="/Users/7one/Documents/Work/mangoesai/livekit_paddle/heyzzx2/test.csv",
)
dataset = load_dataset("csv", data_files=data_files)
# Convert the label to a string.
def convert_label_to_str(batch):
    """Replace ``batch["label"]`` with its ``str()`` form.

    The mapping is modified in place and the same object is returned.
    """
    label_text = str(batch["label"])
    batch["label"] = label_text
    return batch

# Apply the label-to-string conversion to every example.
dataset = dataset.map(convert_label_to_str, num_proc=1)

# Load the classification model and its processor.
# The class names are registered when the model is loaded so that every
# checkpoint written during training already carries the readable labels
# instead of generic placeholder names.
model = AutoModelForAudioClassification.from_pretrained(
    "MIT/ast-finetuned-speech-commands-v2",
    num_labels=2,  # change to the number of classes in your dataset
    id2label={0: "Not Hey ZZX", 1: "Hey ZZX"},
    label2id={"Not Hey ZZX": 0, "Hey ZZX": 1},
    ignore_mismatched_sizes=True,  # ignore the size-mismatch error for the replaced classification head
)
processor = AutoProcessor.from_pretrained("MIT/ast-finetuned-speech-commands-v2")

# Audio preprocessing function.
def preprocess_audio(batch):
    """Compute the model input features for one example.

    Reads the waveform from ``batch["path"]["array"]``, passes it to the
    module-level ``processor`` with a sampling rate of 16000, and stores the
    first item of the returned ``input_values`` as a NumPy array under
    ``batch["input_values"]``.  If anything fails, the error is printed and
    ``batch["input_values"]`` is set to ``None``.  The same mapping is returned.
    """
    try:
        waveform = batch["path"]["array"]
        batch["input_values"] = (
            processor(waveform, sampling_rate=16000, return_tensors="pt")
            .input_values[0]
            .numpy()
        )
    except Exception as err:
        print(f"Error processing {batch}: {err}")
        batch["input_values"] = None
    return batch

# Treat the "path" column as 16 kHz audio, then run the preprocessing function
# over every example and drop the "path" and "text" columns.
dataset = dataset.cast_column("path", Audio(sampling_rate=16000)).map(
    preprocess_audio,
    remove_columns=["path", "text"],
    num_proc=1,
)

# Custom data collator.
def data_collator(features):
    """Stack a list of preprocessed examples into one batch.

    Args:
        features: Sequence of mappings, each with an ``"input_values"`` entry
            (nested list/array of floats, or ``None``) and a ``"label"`` entry
            that ``int()`` can convert.

    Returns:
        dict: ``"input_values"`` as a float32 tensor and ``"labels"`` as an
        int64 tensor, built from the examples whose ``input_values`` is not
        ``None``.

    Raises:
        ValueError: If ``features`` is non-empty but every example has
            ``input_values`` set to ``None``.
    """
    # preprocess_audio stores None for clips it failed to process; skip those
    # instead of letting tensor construction fail with an opaque error.
    usable = [f for f in features if f["input_values"] is not None]
    if features and not usable:
        raise ValueError("data_collator received a batch with no valid input_values")

    # Build one contiguous float32 array first rather than having torch.tensor
    # walk the nested Python lists itself.
    stacked = np.asarray([f["input_values"] for f in usable], dtype=np.float32)
    input_values = torch.from_numpy(stacked)
    labels = torch.tensor([int(f["label"]) for f in usable], dtype=torch.long)  # make sure labels are integers
    return {"input_values": input_values, "labels": labels}

# Training parameters.
training_args = TrainingArguments(
    output_dir="./audio-classification-hey-zzx",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    evaluation_strategy="steps",
    logging_steps=100,
    save_steps=500,
    report_to="none",
)

# Define the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=processor,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

# Start training.
trainer.train()

# Attach the class names to the model config before saving.
model.config.label2id = {"Not Hey ZZX": 0, "Hey ZZX": 1}
model.config.id2label = {0: "Not Hey ZZX", 1: "Hey ZZX"}

# Save the model and the processor into the same directory.
model.save_pretrained("/Users/7one/Documents/Work/mangoesai/livekit_paddle/heyzzx2")
processor.save_pretrained("/Users/7one/Documents/Work/mangoesai/livekit_paddle/heyzzx2")

Please note the following:
1. The training data must be correct; print detailed debug information about the data to verify it.
2. Save the model properly (you can change the path).

Test1 - specific wav file test

In [None]:
from transformers import pipeline

# Load the classifier from the saved model directory.
classifier = pipeline(
    task="audio-classification",
    model="/Users/7one/Documents/Work/mangoesai/livekit_paddle/heyzzx2",
)

# Path of the audio file to test.
audio_path = "/Users/7one/Documents/Work/mangoesai/livekit_paddle/heyzzx/voicedataset/converted_wav/test9.wav"

# Run the classifier on the file and print its prediction.
prediction = classifier(audio_path)
print("Prediction:", prediction)

Test2 - realtime test

In [1]:
import pyaudio
import numpy as np
import torch
from transformers import AutoProcessor, AutoModelForAudioClassification

# Load the fine-tuned audio classification model and its processor.
model_path = "/Users/7one/Documents/Work/mangoesai/livekit_paddle/heyzzx2"
processor = AutoProcessor.from_pretrained(model_path)
# eval() returns the module itself, so it can be chained onto the load call.
model = AutoModelForAudioClassification.from_pretrained(model_path).eval()

# Audio parameters.
sample_rate = 16000  # sampling rate required by the model
chunk_size = 16000   # capture 1 second of audio per read (16000 samples)
silence_threshold = 0.01  # silence detection threshold

# Initialise PyAudio and open a single-channel, 16-bit input stream.
p = pyaudio.PyAudio()
stream = p.open(
    rate=sample_rate,
    channels=1,
    format=pyaudio.paInt16,
    input=True,
    frames_per_buffer=chunk_size,
)

def classify_audio(audio_array):
    """
    Classify an audio clip with the fine-tuned classification model.
    Args:
        audio_array (np.ndarray): audio data (mono, 16 kHz)
    Returns:
        str: label of the predicted class
    """
    # Turn the raw samples into model inputs.
    features = processor(audio_array, sampling_rate=sample_rate, return_tensors="pt", padding=True)
    # Inference only, so gradients are not tracked.
    with torch.no_grad():
        scores = model(**features).logits
    best_index = scores.argmax(dim=1).item()
    return model.config.id2label[best_index]

def is_silent(audio_array, threshold=None):
    """
    Decide whether an audio buffer is silent.
    Args:
        audio_array (np.ndarray): audio data
        threshold (float, optional): peak absolute amplitude below which the
            buffer counts as silent; defaults to the module-level
            ``silence_threshold``
    Returns:
        bool: True if the buffer is empty or its peak absolute amplitude is
            below the threshold
    """
    if threshold is None:
        threshold = silence_threshold
    samples = np.asarray(audio_array)
    # np.max raises on a zero-size array; an empty buffer holds no sound.
    if samples.size == 0:
        return True
    # Convert NumPy's bool_ to a plain Python bool, as documented.
    return bool(np.max(np.abs(samples)) < threshold)

print("Listening for wake word 'Hey ZZX'...")

try:
    while True:
        # Read one chunk from the input stream and scale the int16 samples to floats.
        raw_audio = stream.read(chunk_size, exception_on_overflow=False)
        audio_array = np.frombuffer(raw_audio, dtype=np.int16).astype(np.float32) / 32768.0

        if not is_silent(audio_array):
            # Run the classification model on the chunk.
            predicted_label = classify_audio(audio_array)
            print(f"Detected label: {predicted_label}")

            # Stop listening once the wake word is recognised.
            if predicted_label.lower() == "hey zzx":
                print("Wake word 'Hey ZZX' detected!")
                print("Welcome, ZZX, what can I do for you?")
                break
        else:
            # Silent chunks are not classified.
            print("Silent audio detected. Skipping.")

except KeyboardInterrupt:
    print("Stopping...")

finally:
    # Close the audio stream and release PyAudio.
    stream.stop_stream()
    stream.close()
    p.terminate()


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Listening for wake word 'Hey ZZX'...
Detected label: Hey ZZX
Wake word 'Hey ZZX' detected!
Welcome, ZZX, what can I do for you?
