# Train Custom "Claudinho" Wake Word Model

**Runtime: GPU** — select *Runtime > Change runtime type > T4 GPU*, then choose **Runtime > Run all**.

In [None]:
# === 1. SETUP ===
!pip install -q openwakeword onnxruntime numpy scipy torch
!git clone -q https://github.com/claudinhocoding/claudinho.git claudinho_repo
!wget -q https://huggingface.co/datasets/davidscripka/openwakeword_features/resolve/main/openwakeword_features_ACAV100M_2000_hrs_16bit.npy
!wget -q https://huggingface.co/datasets/davidscripka/openwakeword_features/resolve/main/validation_set_features.npy

import openwakeword, os, glob
oww_dir = os.path.join(os.path.dirname(openwakeword.__file__), 'resources', 'models')
os.makedirs(oww_dir, exist_ok=True)
!wget -q https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/melspectrogram.onnx -O {oww_dir}/melspectrogram.onnx
!wget -q https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/embedding_model.onnx -O {oww_dir}/embedding_model.onnx

samples = sorted(glob.glob('claudinho_repo/training/positive/claudinho/*.wav'))
print(f'Setup complete: {len(samples)} samples')

In [None]:
# === 2. FEATURE EXTRACTION ===
# Shapes (verified):
#   Mel: input 'input' (1,1280) -> output (1,1,5,32) = 5 frames of 32 mel bands per 80ms
#   Emb: input 'input_1' (batch,76,32,1) -> output (batch,1,1,96) = 96-dim embedding
#   Need 76 mel frames = ceil(76/5) = 16 chunks = 1.28s of audio

import numpy as np
import scipy.io.wavfile
import scipy.signal
import onnxruntime as ort

# Frame geometry shared by the feature-extraction code below.
EMB_DIM = 96           # length of one embedding vector
N_MEL_FRAMES = 76      # mel frames the embedding model consumes per window
MELS_PER_CHUNK = 5     # mel frames the mel model emits per chunk
WINDOW = 1280          # samples per chunk: 80ms at 16kHz
CHUNKS_FOR_EMB = 16    # ceil(76/5) = 16 chunks = 1.28s

# ONNX Runtime sessions for the two model files stored in oww_dir.
mel_model_path = os.path.join(oww_dir, 'melspectrogram.onnx')
emb_model_path = os.path.join(oww_dir, 'embedding_model.onnx')
mel_sess = ort.InferenceSession(mel_model_path)
emb_sess = ort.InferenceSession(emb_model_path)

def audio_to_features(audio):
    """Extract 96-dim embeddings from 16kHz audio.

    Args:
        audio: Audio samples. A 1-D array is treated as mono; a 2-D array is
            treated as ``(samples, channels)`` and averaged down to mono.
            If the peak magnitude exceeds 1.5 the samples are assumed to be
            in int16 range and are divided by 32768.

    Returns:
        Array of shape ``(n_windows, EMB_DIM)``, one row per sliding window
        of ``N_MEL_FRAMES`` mel frames. Empty input yields shape
        ``(0, EMB_DIM)``.

    NOTE(review): audio is scaled to roughly [-1, 1] and the mel output is
    used without further scaling. Confirm this matches the pipeline that
    produced the pre-computed negative feature files.
    """
    audio = np.asarray(audio, dtype=np.float32)
    if audio.ndim > 1:
        # Downmix so each chunk below holds exactly WINDOW samples.
        audio = audio.mean(axis=1)
    if audio.size == 0:
        # No audio means no features; also avoids max() on an empty array.
        return np.empty((0, EMB_DIM), dtype=np.float32)
    if np.abs(audio).max() > 1.5:
        audio = audio / 32768.0

    # Minimum: 16 chunks * 1280 = 20480 samples (1.28s). Shorter clips are
    # zero-padded symmetrically so at least one embedding window fits.
    min_samples = CHUNKS_FOR_EMB * WINDOW
    if len(audio) < min_samples:
        pad = min_samples - len(audio)
        audio = np.pad(audio, (pad // 2, pad - pad // 2))

    # Get mel frames from each 80ms chunk; a trailing partial chunk is dropped.
    # Each chunk -> (1,1,5,32), we keep the (5,32) part.
    # After padding there are always at least CHUNKS_FOR_EMB chunks, so this
    # list is never empty.
    n_mel_bands = 32
    all_mel_frames = []
    for start in range(0, len(audio) - WINDOW + 1, WINDOW):
        chunk = audio[start:start + WINDOW].reshape(1, -1).astype(np.float32)
        mel_out = mel_sess.run(None, {'input': chunk})[0]  # (1,1,5,32)
        all_mel_frames.append(mel_out.reshape(-1, n_mel_bands))  # (5, 32)

    mel_stack = np.concatenate(all_mel_frames, axis=0)  # (total_frames, 32)

    # Sliding window: 76 mel frames -> 1 embedding, step by 5 frames (one chunk)
    feats = []
    for i in range(0, len(mel_stack) - N_MEL_FRAMES + 1, MELS_PER_CHUNK):
        window = mel_stack[i:i + N_MEL_FRAMES]  # (76, 32)
        batch = window.reshape(1, N_MEL_FRAMES, n_mel_bands, 1).astype(np.float32)
        emb = emb_sess.run(None, {'input_1': batch})[0]  # (1, 1, 1, 96)
        feats.append(emb.flatten())  # (96,)

    return np.array(feats) if feats else np.empty((0, EMB_DIM), dtype=np.float32)

# Smoke test: run the extractor on the first recording and report the shape.
first_sample = samples[0]
sr, test_audio = scipy.io.wavfile.read(first_sample)
test_feats = audio_to_features(test_audio)
print(f'Test: {os.path.basename(first_sample)} -> {test_feats.shape} embeddings')
emb_dim_report = test_feats.shape[1] if len(test_feats) > 0 else "FAILED"
print(f'Embedding dim: {emb_dim_report}')

In [None]:
# === 3. AUGMENT & EXTRACT ALL ===
def augment(audio):
    """Return the original clip plus 23 augmented variants.

    Variants, in order: the float32 original, 4 with additive white noise,
    6 with a gain change, 6 time-shifted by prepending or appending silence,
    4 speed-changed by linear interpolation, and 3 with noise plus gain.

    Args:
        audio: 1-D array of samples, either in int16 range or in [-1, 1].

    Returns:
        List of 1-D float arrays. Empty input cannot be augmented, so it
        yields a single-element list holding the (empty) original.
    """
    a = np.asarray(audio).astype(np.float32)
    if a.size == 0:
        return [a]

    # Noise levels are fractions of full scale. Pick the scale from the data,
    # using the same peak > 1.5 heuristic the feature extractor uses, so
    # [-1, 1] float audio is not swamped by int16-scale noise.
    full_scale = 32768.0 if np.abs(a).max() > 1.5 else 1.0
    n = len(a)
    variants = [a]

    # Additive white noise at increasing levels.
    for nl in (0.003, 0.008, 0.015, 0.025):
        variants.append(a + np.random.randn(n) * nl * full_scale)

    # Gain changes (quieter and louder).
    variants.extend(a * g for g in (0.4, 0.6, 0.8, 1.2, 1.5, 2.0))

    # Time shifts: silence before, then after, the clip (0.1s / 0.2s / 0.4s
    # at 16kHz).
    for sh in (1600, 3200, 6400):
        silence = np.zeros(sh)
        variants.append(np.concatenate([silence, a]))
        variants.append(np.concatenate([a, silence]))

    # Speed change by linear-interpolation resampling (this also shifts pitch).
    src_pos = np.arange(n)
    for rate in (0.85, 0.92, 1.08, 1.15):
        n_out = int(n / rate)
        variants.append(np.interp(np.linspace(0, n - 1, n_out), src_pos, a))

    # Combined noise + gain.
    for nl, g in ((0.005, 0.7), (0.01, 1.3), (0.02, 0.5)):
        variants.append((a + np.random.randn(n) * nl * full_scale) * g)

    return variants

# Build the positive set: every recording is augmented and each variant is
# turned into embeddings.
print('Extracting features with augmentation...')
all_feats = []
for i, wav in enumerate(samples):
    sr, audio = scipy.io.wavfile.read(wav)
    if audio.ndim > 1:
        # Multi-channel files load as (samples, channels); augment() expects 1-D.
        audio = audio.mean(axis=1)
    if sr != 16000:
        # Keep the resampled signal as float: casting back to int16 wraps any
        # overshoot and would zero out float-format WAVs in [-1, 1].
        audio = scipy.signal.resample(audio, int(len(audio) * 16000 / sr))
    for aug in augment(audio):
        f = audio_to_features(aug)
        if len(f) > 0:
            all_feats.append(f)
    if (i+1) % 10 == 0:
        print(f'  {i+1}/{len(samples)}...')

if not all_feats:
    # np.concatenate([]) would fail with an unhelpful message.
    raise RuntimeError('No positive features were extracted - check that `samples` is not empty')

pos_features = np.concatenate(all_feats)
print(f'\nPositive: {pos_features.shape[0]} embeddings ({pos_features.shape[0]/len(samples):.0f}x per sample)')

In [None]:
# === 4. LOAD NEGATIVES (memory-efficient) ===
# File is 3D: (N, 16, 96) — each row has 16 consecutive 96-dim embeddings.
# Memory-mapping lets us read only the sampled rows.
neg_path = 'openwakeword_features_ACAV100M_2000_hrs_16bit.npy'
neg_mmap = np.load(neg_path, mmap_mode='r')
print(f'Negative file: {neg_mmap.shape}, dtype={neg_mmap.dtype}')

# Target 20 negatives per positive. Each sampled row yields 16 embeddings,
# so draw just enough rows and trim the surplus.
n_pos = pos_features.shape[0]
neg_target = n_pos * 20
n_rows_total = len(neg_mmap)
rows_needed = min(n_rows_total, neg_target // 16 + 1)
idx = np.random.choice(n_rows_total, rows_needed, replace=False)
idx.sort()  # ascending row order for the memory-mapped read
neg_f = neg_mmap[idx].astype(np.float32).reshape(-1, EMB_DIM)[:neg_target]
del neg_mmap

# Validation negatives: the first 2000 rows, flattened when the file is 3D.
val_mmap = np.load('validation_set_features.npy', mmap_mode='r')
print(f'Val file: {val_mmap.shape}, dtype={val_mmap.dtype}')
val_neg = val_mmap[:2000].astype(np.float32)
del val_mmap
if val_neg.ndim == 3:
    # (N, 16, 96) -> (N*16, 96)
    val_neg = val_neg.reshape(-1, EMB_DIM)

print(f'Sampled: {neg_f.shape[0]} train neg, {val_neg.shape[0]} val neg ({neg_f.shape[0]/n_pos:.0f}:1 ratio)')

In [None]:
# === 5. TRAIN ===
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Standardise every feature dimension with statistics of the pooled
# positive + negative set; the same mean/std are saved with the exported model.
all_X = np.concatenate([pos_features, neg_f])
feat_mean, feat_std = all_X.mean(0), all_X.std(0) + 1e-8  # epsilon avoids division by zero
X_pos = (pos_features - feat_mean) / feat_std
X_neg = (neg_f - feat_mean) / feat_std

# Hold out 15% of the positives (at least 10) for validation.
nv = max(int(len(X_pos) * 0.15), 10)
p = np.random.permutation(len(X_pos))
X_pv, X_pt = X_pos[p[:nv]], X_pos[p[nv:]]

# Training set: remaining positives + sampled negatives, shuffled together.
X_tr = np.concatenate([X_pt, X_neg])
y_tr = np.concatenate([np.ones(len(X_pt)), np.zeros(len(X_neg))])
order = np.random.permutation(len(X_tr))
X_tr, y_tr = X_tr[order], y_tr[order]

# Validation set: held-out positives + up to 2000 validation negatives.
vn = ((val_neg - feat_mean) / feat_std)[:2000]
X_val = np.concatenate([X_pv, vn])
y_val = np.concatenate([np.ones(len(X_pv)), np.zeros(len(vn))])

loader = DataLoader(TensorDataset(torch.FloatTensor(X_tr), torch.FloatTensor(y_tr)), batch_size=512, shuffle=True)
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {dev} | Train: {len(X_pt)} pos + {len(X_neg)} neg | Val: {len(X_pv)} pos + {len(vn)} neg')

# Small MLP on single embeddings; it outputs a logit (the sigmoid is applied
# by the loss during training).
EPOCHS = 75
model = nn.Sequential(nn.Linear(EMB_DIM, 32), nn.ReLU(), nn.Linear(32, 1)).to(dev)
# Weight positives by the neg:pos ratio to offset the class imbalance.
pw = torch.tensor([len(X_neg) / max(len(X_pt), 1)]).to(dev)
crit = nn.BCEWithLogitsLoss(pos_weight=pw)
opt = optim.Adam(model.parameters(), lr=0.001)
sched = optim.lr_scheduler.CosineAnnealingLR(opt, T_max=EPOCHS)

# Start below any reachable accuracy so the first epoch always records a
# checkpoint and best_st is never None at load time.
best_acc, best_st = -1.0, None
X_val_t = torch.FloatTensor(X_val).to(dev)  # built once, reused every epoch
for ep in range(EPOCHS):
    model.train()
    tl = 0.0
    for xb, yb in loader:
        xb, yb = xb.to(dev), yb.to(dev)
        opt.zero_grad()
        # squeeze(-1) drops only the output dim. A bare squeeze() would also
        # drop the batch dim of a final batch of size 1 and make the loss
        # raise on the shape mismatch with yb.
        loss = crit(model(xb).squeeze(-1), yb)
        loss.backward()
        opt.step()
        tl += loss.item()
    sched.step()

    model.eval()
    with torch.no_grad():
        vo = model(X_val_t).squeeze(-1)
        vp = (torch.sigmoid(vo) > 0.5).cpu().numpy()
    acc = (vp == y_val).mean()
    rec = vp[y_val == 1].mean() if (y_val == 1).sum() > 0 else 0
    fpr = vp[y_val == 0].mean() if (y_val == 0).sum() > 0 else 0

    # Keep a CPU copy of the weights from the best validation-accuracy epoch.
    if acc > best_acc:
        best_acc = acc
        best_st = {k: v.cpu().clone() for k, v in model.state_dict().items()}
    if (ep+1) % 10 == 0:
        print(f'Ep {ep+1:3d} | Loss {tl/len(loader):.4f} | Acc {acc:.3f} | Rec {rec:.3f} | FPR {fpr:.4f}')

model.load_state_dict(best_st)
print(f'\nDone! Best accuracy: {best_acc:.3f}')

In [None]:
# === 6. EXPORT & TEST ===
class Exp(nn.Module):
    """Export wrapper: applies a sigmoid to the output of the wrapped module ``b``."""

    def __init__(self, b):
        """Store the module whose output will be passed through the sigmoid."""
        super().__init__()
        self.b = b

    def forward(self, x):
        """Run ``x`` through the wrapped module and return the sigmoid of the result."""
        raw = self.b(x)
        return torch.sigmoid(raw)

# Wrap the trained network so the exported graph outputs probabilities,
# then export it with a dynamic batch axis.
em = Exp(model.cpu()).eval()
os.makedirs('output', exist_ok=True)
torch.onnx.export(em, torch.randn(1, EMB_DIM), 'output/claudinho.onnx',
    input_names=['input'], output_names=['output'],
    dynamic_axes={'input':{0:'b'},'output':{0:'b'}}, opset_version=11)
# The normalisation statistics are saved next to the model because inputs must
# be standardised with them before scoring (as done in _peak_score below).
np.savez('output/claudinho_norm.npz', mean=feat_mean, std=feat_std)

ts = ort.InferenceSession('output/claudinho.onnx')
print(f'Model: {os.path.getsize("output/claudinho.onnx")/1024:.1f} KB')


def _peak_score(audio):
    """Return the highest score over all windows of `audio`, or None if no features."""
    f = audio_to_features(audio)
    if len(f) == 0:
        return None
    fn = ((f - feat_mean) / feat_std).astype(np.float32)
    return float(ts.run(None, {'input': fn})[0].max())


print('\nPositive samples (should be > 0.5):')
for w in samples[:5]:
    sr, a = scipy.io.wavfile.read(w)
    # Prepare the clip the same way as for training: mono, 16kHz.
    if a.ndim > 1:
        a = a.mean(axis=1)
    if sr != 16000:
        a = scipy.signal.resample(a, int(len(a) * 16000 / sr))
    score = _peak_score(a)
    if score is not None:
        print(f'  {os.path.basename(w)}: {score:.3f} {"OK" if score > 0.5 else "LOW"}')

print('\nNoise (should be < 0.3):')
for i in range(3):
    # 48000 samples (3s at 16kHz) of Gaussian noise with standard deviation 0.1.
    noise = np.random.randn(48000).astype(np.float32) * 0.1
    score = _peak_score(noise)
    if score is not None:
        print(f'  noise_{i}: {score:.3f}')

In [None]:
# === 7. DOWNLOAD ===
from google.colab import files
# Download the exported model and its normalisation statistics.
for artifact in ('output/claudinho.onnx', 'output/claudinho_norm.npz'):
    files.download(artifact)
print('Copy both to Pi: ~/claudinho/models/')