# Train Custom "Claudinho" Wake Word Model

**Runtime: GPU** (Runtime > Change runtime type > T4 GPU) then **Runtime > Run all**

In [None]:
# === 1. SETUP ===
!pip install -q openwakeword onnxruntime numpy scipy torch
!git clone -q https://github.com/claudinhocoding/claudinho.git claudinho_repo
!wget -q https://huggingface.co/datasets/davidscripka/openwakeword_features/resolve/main/openwakeword_features_ACAV100M_2000_hrs_16bit.npy
!wget -q https://huggingface.co/datasets/davidscripka/openwakeword_features/resolve/main/validation_set_features.npy

import openwakeword, os, glob
oww_dir = os.path.join(os.path.dirname(openwakeword.__file__), 'resources', 'models')
os.makedirs(oww_dir, exist_ok=True)
!wget -q https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/melspectrogram.onnx -O {oww_dir}/melspectrogram.onnx
!wget -q https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/embedding_model.onnx -O {oww_dir}/embedding_model.onnx

samples = sorted(glob.glob('claudinho_repo/training/positive/claudinho/*.wav'))
print(f'Setup complete: {len(samples)} samples')

In [None]:
# === 2. DETECT MODEL SHAPES ===
# Probe the two ONNX models once so the feature extraction code can adapt to their
# actual input/output layouts instead of hard-coding them.
import numpy as np
import scipy.io.wavfile
import scipy.signal
import onnxruntime as ort

mel_sess = ort.InferenceSession(os.path.join(oww_dir, 'melspectrogram.onnx'))
emb_sess = ort.InferenceSession(os.path.join(oww_dir, 'embedding_model.onnx'))

# Get actual input/output names
MEL_INPUT_NAME = mel_sess.get_inputs()[0].name
EMB_INPUT_NAME = emb_sess.get_inputs()[0].name
print(f'Mel input name: {MEL_INPUT_NAME}')
print(f'Emb input name: {EMB_INPUT_NAME}')

# Probe mel output shape by running one silent 1280-sample chunk through the model
test_mel = mel_sess.run(None, {MEL_INPUT_NAME: np.zeros((1, 1280), dtype=np.float32)})[0]
print(f'Mel output shape per 80ms chunk: {test_mel.shape}')

# Probe embedding input shape
emb_input_info = emb_sess.get_inputs()[0]
emb_output_info = emb_sess.get_outputs()[0]
print(f'Embedding expects: {emb_input_info.shape}')
print(f'Embedding outputs: {emb_output_info.shape}')

# Extract the expected shape: axis 1 is read as the number of mel frames and
# axis 2 as the mel feature dimension (axis 0 is taken to be the batch axis).
EMB_INPUT_SHAPE = emb_input_info.shape  # e.g. [1, 76, 32]
N_MEL_FRAMES = EMB_INPUT_SHAPE[1]  # how many mel frames the embedding needs
MEL_DIM = EMB_INPUT_SHAPE[2]       # mel feature dimension

# How many mel frames does one 1280-sample chunk produce?
# Always collapse the output to (n_frames, mel_dim), whatever its rank, exactly as
# audio_to_features does. Flattening only rank-3 outputs would leave a rank-4 output
# untouched and read the frame count and feature dim from the wrong axes.
test_mel_flat = test_mel.reshape(-1, test_mel.shape[-1])
MELS_PER_CHUNK = test_mel_flat.shape[0]
MEL_FRAMES_PER_CHUNK = MELS_PER_CHUNK  # alias kept for backward compatibility
ACTUAL_MEL_DIM = test_mel_flat.shape[1]

print(f'\nMel frames per 80ms chunk: {MELS_PER_CHUNK}')
print(f'Mel feature dim: {ACTUAL_MEL_DIM}')
print(f'Embedding needs {N_MEL_FRAMES} frames of dim {MEL_DIM}')
if ACTUAL_MEL_DIM != MEL_DIM:
    print(f'Warning: mel model emits dim {ACTUAL_MEL_DIM} but embedding model expects {MEL_DIM}')

# Chunks needed for one embedding
CHUNKS_FOR_EMB = int(np.ceil(N_MEL_FRAMES / MELS_PER_CHUNK))
print(f'Audio chunks needed for one embedding: {CHUNKS_FOR_EMB} ({CHUNKS_FOR_EMB * 0.08:.2f}s)')

In [None]:
# === 3. FEATURE EXTRACTION ===
WINDOW = 1280  # 80ms at 16kHz

def audio_to_features(audio):
    """Extract embeddings from 16kHz audio.

    The audio is cut into non-overlapping WINDOW-sample chunks, each chunk is run
    through the mel-spectrogram model, the resulting frames are stacked, and a
    window of N_MEL_FRAMES frames is slid over the stack (advancing by
    MELS_PER_CHUNK frames) and fed to the embedding model.

    Args:
        audio: array-like of samples. Values whose peak magnitude exceeds 1.5 are
            treated as 16-bit PCM and divided by 32768. A 2-D array is averaged
            over axis 1 (scipy.io.wavfile.read returns (n_samples, n_channels)
            for multi-channel files).

    Returns:
        2-D array with one flattened embedding per row. When no embedding can be
        produced the result has shape (0, 96).
    """
    # Fallback for "no embeddings"; 96 matches the default the notebook uses for EMB_DIM.
    no_features = np.empty((0, 96))

    audio = np.asarray(audio, dtype=np.float32)
    if audio.ndim > 1:
        # Downmix to mono; otherwise the chunk reshape below would interleave channels.
        audio = audio.mean(axis=1)
    # Guard on size: max() of an empty array raises ValueError.
    if audio.size and np.abs(audio).max() > 1.5:
        audio = audio / 32768.0
    # NOTE(review): this [-1, 1] normalisation is this notebook's own choice - confirm
    # it matches the scaling used to produce the downloaded negative features.

    # Minimum audio length: centre-pad short clips with zeros so that at least one
    # embedding window fits.
    min_samples = CHUNKS_FOR_EMB * WINDOW
    if len(audio) < min_samples:
        pad = min_samples - len(audio)
        audio = np.pad(audio, (pad // 2, pad - pad // 2))

    # Get all mel frames, one chunk at a time (a trailing partial chunk is dropped)
    all_mels = []
    for s in range(0, len(audio) - WINDOW + 1, WINDOW):
        chunk = audio[s:s+WINDOW].reshape(1, -1)
        m = mel_sess.run(None, {MEL_INPUT_NAME: chunk})[0]
        # Flatten to (n_frames, mel_dim)
        all_mels.append(m.reshape(-1, m.shape[-1]))

    if not all_mels:
        return no_features

    # Stack all mel frames
    mel_stack = np.concatenate(all_mels, axis=0)  # (total_frames, mel_dim)

    # N_MEL_FRAMES and MEL_DIM were read from axes 1 and 2 of EMB_INPUT_SHAPE, so the
    # window must occupy those same axes. Any further axes (e.g. a trailing channel
    # axis on a rank-4 input) are singleton. Loop-invariant, so computed once.
    extra_axes = max(len(EMB_INPUT_SHAPE) - 3, 0)
    emb_shape = (1, N_MEL_FRAMES, MEL_DIM) + (1,) * extra_axes

    # Sliding window over mel frames to get embeddings
    feats = []
    step = max(1, MELS_PER_CHUNK)  # step by one chunk worth of frames
    for i in range(0, len(mel_stack) - N_MEL_FRAMES + 1, step):
        # (N_MEL_FRAMES, mel_dim), trimmed to the feature width the model expects
        window = mel_stack[i:i+N_MEL_FRAMES, :MEL_DIM]
        batch = window.reshape(emb_shape).astype(np.float32)
        emb = emb_sess.run(None, {EMB_INPUT_NAME: batch})[0]
        feats.append(emb.flatten())

    return np.array(feats) if feats else no_features

# Quick test: run the extractor on the first sample and record the embedding width.
sr, test_audio = scipy.io.wavfile.read(samples[0])
if sr != 16000:
    # The extractor works on 16 kHz audio (WINDOW = 1280 samples = 80 ms), so honour
    # the file's sample rate instead of ignoring it.
    test_audio = scipy.signal.resample(test_audio, int(len(test_audio) * 16000 / sr))
test_feats = audio_to_features(test_audio)
print(f'Test: {os.path.basename(samples[0])} -> {test_feats.shape[0]} embeddings of dim {test_feats.shape[1] if len(test_feats) > 0 else "?"}')

# Width of one embedding vector; later cells size the classifier input with it.
# Falls back to 96 when the test clip produced no embeddings.
EMB_DIM = test_feats.shape[1] if len(test_feats) > 0 else 96
print(f'Embedding dimension: {EMB_DIM}')

In [None]:
# === 4. AUGMENT & EXTRACT ALL ===
def augment(audio):
    """Return a list of augmented variants of one clip.

    The list holds, in this order: the clip itself as float32, 4 copies with added
    Gaussian noise, 6 gain-scaled copies, 6 copies padded with silence (before and
    after, for 3 pad lengths), 4 copies re-timed by linear interpolation, and
    3 copies with noise followed by gain - 24 arrays in total.

    Noise amplitudes are given as a fraction of 32768.
    # assumes samples are on a 16-bit PCM scale - TODO confirm for non-int16 input
    """
    base = audio.astype(np.float32)
    n_samples = len(base)
    variants = [base]

    # Additive Gaussian noise at increasing levels.
    variants.extend(
        base + np.random.randn(n_samples) * level * 32768
        for level in (0.003, 0.008, 0.015, 0.025)
    )

    # Quieter and louder copies.
    variants.extend(base * gain for gain in (0.4, 0.6, 0.8, 1.2, 1.5, 2.0))

    # Leading / trailing silence, which shifts the clip inside the window.
    for pad_len in (1600, 3200, 6400):
        silence = np.zeros(pad_len)
        variants.append(np.concatenate([silence, base]))
        variants.append(np.concatenate([base, silence]))

    # Speed change: resample the clip onto a shorter or longer time grid.
    source_idx = np.arange(n_samples)
    for rate in (0.85, 0.92, 1.08, 1.15):
        target_len = int(n_samples / rate)
        target_idx = np.linspace(0, n_samples - 1, target_len)
        variants.append(np.interp(target_idx, source_idx, base))

    # Noise and gain combined.
    for level, gain in ((0.005, 0.7), (0.01, 1.3), (0.02, 0.5)):
        noisy = base + np.random.randn(n_samples) * level * 32768
        variants.append(noisy * gain)

    return variants

# Build the positive training set: every sample is augmented and each variant is
# turned into embeddings.
print('Extracting features...')
all_feats = []
for i, wav in enumerate(samples):
    sr, audio = scipy.io.wavfile.read(wav)
    if audio.ndim > 1:
        # Multi-channel files are read as (n_samples, n_channels); downmix to mono,
        # since augment() works on 1-D clips.
        audio = audio.mean(axis=1)
    if sr != 16000:
        # Keep the floating-point resample result. Casting it back to int16 wraps
        # around when the filter overshoots full scale and truncates float-format
        # recordings to zero.
        audio = scipy.signal.resample(audio, int(len(audio) * 16000 / sr))
    for aug in augment(audio):
        f = audio_to_features(aug)
        if len(f) > 0:
            all_feats.append(f)
    if (i+1) % 10 == 0:
        print(f'  {i+1}/{len(samples)}...')

if not all_feats:
    # np.concatenate([]) would fail with a message that hides the real cause.
    raise RuntimeError('No embeddings were extracted from the positive samples')

pos_features = np.concatenate(all_feats)
print(f'\nPositive: {pos_features.shape[0]} embeddings ({pos_features.shape[0]/len(samples):.0f}x multiplier)')

In [None]:
# === 5. LOAD NEGATIVES ===
def _as_embedding_rows(raw, dim, label):
    """Turn a raw feature array into float32 rows of width `dim`.

    Leading axes are flattened so each row is one feature vector, int16 data is
    divided by 256, and the last axis is truncated or zero-padded to `dim`.
    `label` is only used in the warning printed when the width has to be adjusted.
    """
    feats = np.asarray(raw, dtype=np.float32)
    feats = feats.reshape(-1, feats.shape[-1])
    # NOTE(review): the /256 rescaling of int16 data is kept from the original cell;
    # confirm it is the right convention for these files.
    if raw.dtype == np.int16:
        feats /= 256.0

    width = feats.shape[1]
    if width != dim:
        print(f'Warning: {label} features dim {width} != {dim}, truncating/padding')
        if width > dim:
            feats = feats[:, :dim]
        else:
            feats = np.pad(feats, ((0, 0), (0, dim - width)))
    return feats

# Memory-map the training negatives and convert only the rows that are kept;
# loading the file fully and then copying it to float32 needs several times its
# size in RAM.
neg_raw = np.load('openwakeword_features_ACAV100M_2000_hrs_16bit.npy', mmap_mode='r')
vectors_per_row = int(np.prod(neg_raw.shape[1:-1]))  # 1 for a plain 2-D array
n_available = neg_raw.shape[0] * vectors_per_row

# Cap the negatives at 50 per positive embedding.
max_neg = min(n_available, pos_features.shape[0] * 50)
n_rows = min(neg_raw.shape[0], -(-max_neg // vectors_per_row))  # ceil division
# Sorted indices keep the reads from the memory-mapped file sequential.
row_idx = np.sort(np.random.choice(neg_raw.shape[0], n_rows, replace=False))
neg_f = _as_embedding_rows(neg_raw[row_idx], EMB_DIM, 'neg')[:max_neg]

val_raw = np.load('validation_set_features.npy')
val_neg = _as_embedding_rows(val_raw, EMB_DIM, 'val')

print(f'Negative: {neg_f.shape[0]} train, {val_neg.shape[0]} val (ratio {neg_f.shape[0]/pos_features.shape[0]:.0f}:1)')

In [None]:
# === 6. TRAIN ===
# Trains a small two-layer classifier on top of the embeddings.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Per-feature standardisation; the same mean/std are saved next to the model at
# export time because inference must apply the identical transform.
all_X = np.concatenate([pos_features, neg_f])
feat_mean, feat_std = all_X.mean(0), all_X.std(0) + 1e-8  # epsilon avoids division by zero
X_pos = (pos_features - feat_mean) / feat_std
X_neg = (neg_f - feat_mean) / feat_std

# Hold out 15% of the positives (at least 10) for validation.
nv = max(int(len(X_pos) * 0.15), 10)
p = np.random.permutation(len(X_pos))
X_pv, X_pt = X_pos[p[:nv]], X_pos[p[nv:]]

X_tr = np.concatenate([X_pt, X_neg])
y_tr = np.concatenate([np.ones(len(X_pt)), np.zeros(len(X_neg))])
s = np.random.permutation(len(X_tr))
X_tr, y_tr = X_tr[s], y_tr[s]

# Validation negatives come from the separate validation file (first 2000 rows).
vn = ((val_neg - feat_mean) / feat_std)[:2000]
X_val = np.concatenate([X_pv, vn])
y_val = np.concatenate([np.ones(len(X_pv)), np.zeros(len(vn))])

loader = DataLoader(TensorDataset(torch.FloatTensor(X_tr), torch.FloatTensor(y_tr)), batch_size=512, shuffle=True)
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {dev} | Train: {len(X_pt)} pos + {len(X_neg)} neg | Val: {nv} pos + {len(vn)} neg')

model = nn.Sequential(nn.Linear(EMB_DIM, 32), nn.ReLU(), nn.Linear(32, 1)).to(dev)
# Weight the positive class by the negative:positive ratio to offset the imbalance.
pw = torch.tensor([len(X_neg) / max(len(X_pt), 1)]).to(dev)
crit = nn.BCEWithLogitsLoss(pos_weight=pw)
opt = optim.Adam(model.parameters(), lr=0.001)
sched = optim.lr_scheduler.CosineAnnealingLR(opt, T_max=75)

# The validation set does not change, so move it to the device once.
X_val_t = torch.FloatTensor(X_val).to(dev)

# Start below any reachable accuracy so the first epoch always stores a checkpoint
# and load_state_dict below never receives None.
best_acc, best_st = -1.0, None
for ep in range(75):
    model.train()
    tl = 0
    for xb, yb in loader:
        xb, yb = xb.to(dev), yb.to(dev)
        opt.zero_grad()
        # squeeze(-1) removes only the output axis. A bare squeeze() would also drop
        # the batch axis of a final batch of size 1, and the loss would then reject
        # the shape mismatch with the target.
        l = crit(model(xb).squeeze(-1), yb)
        l.backward()
        opt.step()
        tl += l.item()
    sched.step()

    model.eval()
    with torch.no_grad():
        vo = model(X_val_t).squeeze(-1)
        vp = (torch.sigmoid(vo) > 0.5).cpu().numpy()
        acc = (vp == y_val).mean()
        rec = vp[y_val==1].mean() if (y_val==1).sum() > 0 else 0
        fpr = vp[y_val==0].mean() if (y_val==0).sum() > 0 else 0
    # Keep a CPU copy of the best weights seen so far (by validation accuracy).
    if acc > best_acc:
        best_acc = acc
        best_st = {k: v.cpu().clone() for k, v in model.state_dict().items()}
    if (ep+1) % 10 == 0:
        print(f'Ep {ep+1:3d} | Loss {tl/len(loader):.4f} | Acc {acc:.3f} | Rec {rec:.3f} | FPR {fpr:.4f}')

model.load_state_dict(best_st)
print(f'\nDone! Best accuracy: {best_acc:.3f}')

In [None]:
# === 7. EXPORT ONNX ===
class Exp(nn.Module):
    """Export wrapper that appends a sigmoid to a module producing logits.

    The wrapped module is stored as ``self.b``; calling the wrapper returns the
    sigmoid of whatever ``self.b`` returns.
    """

    def __init__(self, b):
        """Store the wrapped module ``b``."""
        super().__init__()
        self.b = b

    def forward(self, x):
        """Run ``x`` through the wrapped module and squash the result into (0, 1)."""
        logits = self.b(x)
        return torch.sigmoid(logits)

# Wrap the trained classifier so the exported graph emits probabilities, then
# export it with a dynamic batch axis.
em = Exp(model.cpu()).eval()
os.makedirs('output', exist_ok=True)
torch.onnx.export(em, torch.randn(1, EMB_DIM), 'output/claudinho.onnx',
    input_names=['input'], output_names=['output'],
    dynamic_axes={'input':{0:'b'},'output':{0:'b'}}, opset_version=11)
# The normalisation statistics must ship with the model: inputs have to be
# standardised with the same mean/std that were used in training.
np.savez('output/claudinho_norm.npz', mean=feat_mean, std=feat_std)

ts = ort.InferenceSession('output/claudinho.onnx')
print(f'Model: {os.path.getsize("output/claudinho.onnx")/1024:.1f} KB')

def _max_score(audio):
    """Score `audio` with the exported model.

    Returns the highest score over all embeddings of the clip, or None when the
    clip yields no embeddings.
    """
    f = audio_to_features(audio)
    if len(f) == 0:
        return None
    fn = (f - feat_mean) / feat_std
    return ts.run(None, {'input': fn.astype(np.float32)})[0].max()

print('\nReal samples:')
for w in samples[:5]:
    sr, a = scipy.io.wavfile.read(w)
    if sr != 16000:
        # Score the clip at the rate the features were trained on.
        a = scipy.signal.resample(a, int(len(a) * 16000 / sr))
    score = _max_score(a)
    if score is not None:
        print(f'  {os.path.basename(w)}: {score:.3f} {"OK" if score>0.5 else "LOW"}')

print('Noise:')
for i in range(3):
    # 48000 samples of low-level Gaussian noise as a negative sanity check.
    n = np.random.randn(48000).astype(np.float32)*0.1
    score = _max_score(n)
    if score is not None:
        print(f'  noise_{i}: {score:.3f}')

In [None]:
# === 8. DOWNLOAD ===
from google.colab import files

# Send both exported artifacts to the browser: the model first, then its
# normalisation statistics.
for artifact in ('output/claudinho.onnx', 'output/claudinho_norm.npz'):
    files.download(artifact)

print('Copy both to Pi: ~/claudinho/models/')