# Train Custom "Claudinho" Wake Word Model

This notebook trains an openWakeWord model using **real voice recordings** instead of synthetic TTS.

**Runtime: Use GPU** (Runtime → Change runtime type → T4 GPU)

Total time: ~30-45 minutes

## 1. Install Dependencies

In [None]:
# Install openWakeWord and training dependencies
!git clone https://github.com/dscripka/openWakeWord
!pip install -e ./openWakeWord

# Install piper-sample-generator (for adversarial negative generation)
!git clone https://github.com/rhasspy/piper-sample-generator
!wget -O piper-sample-generator/models/en_US-libritts_r-medium.pt 'https://github.com/rhasspy/piper-sample-generator/releases/download/v2.0.0/en_US-libritts_r-medium.pt'
!pip install piper-phonemize
!pip install webrtcvad

# Training dependencies
!pip install mutagen==1.47.0
!pip install torchinfo==1.8.0
!pip install torchmetrics==1.2.0
!pip install speechbrain==0.5.14
!pip install audiomentations==0.33.0
!pip install torch-audiomentations==0.11.0
!pip install acoustics==0.2.6
!pip install pronouncing==0.2.0
!pip install datasets==2.14.6
!pip install deep-phonemizer==0.0.19

# Fix torchaudio compatibility (set_audio_backend removed in newer versions)
import torchaudio
if not hasattr(torchaudio, 'set_audio_backend'):
    torchaudio.set_audio_backend = lambda x: None

# Also patch the file on disk so subprocess calls work
import site, glob as g
for f in g.glob(site.getsitepackages()[0] + '/torch_audiomentations/utils/io.py'):
    txt = open(f).read()
    if 'set_audio_backend' in txt:
        txt = txt.replace('torchaudio.set_audio_backend("soundfile")', 
                          'pass  # patched: set_audio_backend removed in newer torchaudio')
        open(f, 'w').write(txt)
        print('Patched torch_audiomentations/utils/io.py')

# Download openWakeWord embedding models
import os
os.makedirs('./openWakeWord/openwakeword/resources/models', exist_ok=True)
!wget https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/embedding_model.onnx -O ./openWakeWord/openwakeword/resources/models/embedding_model.onnx
!wget https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/embedding_model.tflite -O ./openWakeWord/openwakeword/resources/models/embedding_model.tflite
!wget https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/melspectrogram.onnx -O ./openWakeWord/openwakeword/resources/models/melspectrogram.onnx
!wget https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/melspectrogram.tflite -O ./openWakeWord/openwakeword/resources/models/melspectrogram.tflite

print('\n‚úÖ Dependencies installed!')

## 2. Get Training Data

In [None]:
# Clone the Claudinho repo to get the recorded voice samples
!git clone https://github.com/claudinhocoding/claudinho.git claudinho_repo

import glob
samples = sorted(glob.glob('claudinho_repo/training/positive/claudinho/*.wav'))
print(f'\n‚úÖ Found {len(samples)} voice samples')

In [None]:
# Download Room Impulse Responses (for realistic reverb augmentation)
import os  # imported here too so the cell does not depend on an earlier cell's import
import numpy as np
import scipy
import scipy.io.wavfile  # explicit: do not rely on `import scipy` exposing the submodule
from tqdm import tqdm
import datasets

output_dir = './mit_rirs'
os.makedirs(output_dir, exist_ok=True)
rir_dataset = datasets.load_dataset('davidscripka/MIT_environmental_impulse_responses', split='train', streaming=True)

for row in tqdm(rir_dataset):
    name = row['audio']['path'].split('/')[-1]
    # Clip to [-1, 1] before scaling so out-of-range samples saturate instead of
    # wrapping around when cast to int16.
    pcm = (np.clip(row['audio']['array'], -1.0, 1.0) * 32767).astype(np.int16)
    # Written with a 16 kHz header and no resampling.
    # NOTE(review): assumes the dataset audio is already 16 kHz - confirm.
    scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, pcm)

print('‚úÖ Room impulse responses downloaded')

In [None]:
# Download background noise (AudioSet + Free Music Archive)
from pathlib import Path

# AudioSet
os.makedirs('audioset', exist_ok=True)
os.makedirs('./audioset_16k', exist_ok=True)
fname = 'bal_train09.tar'
!wget -O audioset/{fname} 'https://huggingface.co/datasets/agkphysics/AudioSet/resolve/main/data/{fname}'
!cd audioset && tar -xvf {fname}

audioset_dataset = datasets.Dataset.from_dict({'audio': [str(i) for i in Path('audioset/audio').glob('**/*.flac')]})
audioset_dataset = audioset_dataset.cast_column('audio', datasets.Audio(sampling_rate=16000))
for row in tqdm(audioset_dataset):
    name = row['audio']['path'].split('/')[-1].replace('.flac', '.wav')
    scipy.io.wavfile.write(os.path.join('./audioset_16k', name), 16000, (row['audio']['array']*32767).astype(np.int16))

# Free Music Archive (1 hour)
os.makedirs('./fma', exist_ok=True)
fma_dataset = datasets.load_dataset('rudraml/fma', name='small', split='train', streaming=True)
fma_dataset = iter(fma_dataset.cast_column('audio', datasets.Audio(sampling_rate=16000)))
for i in tqdm(range(120)):  # 120 clips √ó 30s = 1 hour
    row = next(fma_dataset)
    name = row['audio']['path'].split('/')[-1].replace('.mp3', '.wav')
    scipy.io.wavfile.write(os.path.join('./fma', name), 16000, (row['audio']['array']*32767).astype(np.int16))

print('‚úÖ Background noise downloaded')

In [None]:
# Download pre-computed negative features and validation data
!wget -q https://huggingface.co/datasets/davidscripka/openwakeword_features/resolve/main/openwakeword_features_ACAV100M_2000_hrs_16bit.npy
!wget -q https://huggingface.co/datasets/davidscripka/openwakeword_features/resolve/main/validation_set_features.npy

print('‚úÖ Negative features downloaded')

## 3. Prepare Real Recordings + Generate Adversarial Negatives

In [None]:
import shutil
import random
import wave
import struct

# Create the clip directories under the model output folder.
# NOTE(review): confirm these folder names are the ones openWakeWord's train.py
# actually reads - it may expect <output_dir>/<model_name>/positive_train (and
# similar) instead. Verify against train.py before relying on this layout.
model_dir = './my_custom_model'
for subdir in ('positive', 'positive_val', 'negative', 'negative_val'):
    os.makedirs(f'{model_dir}/{subdir}', exist_ok=True)

# Split recordings: 80% train, 20% validation (fixed seed keeps the split reproducible)
random.seed(42)
all_samples = sorted(glob.glob('claudinho_repo/training/positive/claudinho/*.wav'))
# Fail early: with fewer than two recordings one side of the split would be
# empty and the problem would only surface much later in the pipeline.
if len(all_samples) < 2:
    raise FileNotFoundError(
        f'Found {len(all_samples)} recording(s) in claudinho_repo/training/positive/claudinho/ '
        '- need at least 2; check that the repository clone succeeded'
    )
random.shuffle(all_samples)

split_idx = int(len(all_samples) * 0.8)
train_samples = all_samples[:split_idx]
val_samples = all_samples[split_idx:]

# Copy to model directory under sequential, zero-padded names
for i, src in enumerate(train_samples):
    shutil.copy(src, f'{model_dir}/positive/claudinho_{i:04d}.wav')

for i, src in enumerate(val_samples):
    shutil.copy(src, f'{model_dir}/positive_val/claudinho_{i:04d}.wav')

print(f'‚úÖ Split recordings: {len(train_samples)} train, {len(val_samples)} validation')

In [None]:
# Build the training config for the openWakeWord pipeline.
# Adversarial negatives are words that SOUND similar to "claudinho" but aren't it;
# they help the model learn to discriminate.

import yaml
import sys

# Load default config. The file is opened in a `with` block so the handle is closed.
# NOTE(security): yaml.Loader can construct arbitrary Python objects. It is kept
# here because the file comes from the repository cloned by this notebook;
# consider yaml.safe_load if the source is ever less trusted.
with open('openWakeWord/examples/custom_model.yml', 'r') as config_file:
    config = yaml.load(config_file.read(), yaml.Loader)

# Customize for our use case
config['target_phrase'] = ['claudinho']
config['model_name'] = 'claudinho'
# Intended to cover adversarial negatives only (real positives are supplied separately).
# NOTE(review): confirm how train.py uses n_samples for positive clips.
config['n_samples'] = 500
config['n_samples_val'] = 200
config['output_dir'] = model_dir
config['piper_sample_generator_path'] = './piper-sample-generator'
config['rir_paths'] = ['./mit_rirs']
config['background_paths'] = ['./audioset_16k', './fma']
config['false_positive_validation_data_path'] = 'validation_set_features.npy'
config['feature_data_files'] = {'ACAV100M_sample': 'openwakeword_features_ACAV100M_2000_hrs_16bit.npy'}

# Training params chosen for a small real dataset
config['steps'] = 25000
# Intended to multiply the real recordings roughly 10x through augmentation
# (the exact clip count depends on how many recordings ended up in the train split).
config['augmentation_rounds'] = 10
config['target_accuracy'] = 0.6
config['target_recall'] = 0.3
config['target_false_positives_per_hour'] = 0.5
config['layer_size'] = 32
config['model_type'] = 'dnn'

# Custom adversarial phrases (words that might sound like "claudinho")
config['custom_negative_phrases'] = [
    'cloud', 'cloudy', 'clothing', 'clapping', 'climbing',
    'cleaning', 'clown', 'clone', 'close', 'club',
    'calling', 'coming', 'coding', 'counting', 'cooling'
]

# Persist the customized config for the train.py invocations that follow
with open('claudinho_config.yaml', 'w') as f:
    yaml.dump(config, f)

print('‚úÖ Training config written')
print(f'   Augmentation rounds: {config["augmentation_rounds"]}')
print(f'   Training steps: {config["steps"]}')

In [None]:
# Generate ONLY the adversarial negatives (we already have real positives)
# This uses Piper TTS to create English words that sound similar
!{sys.executable} openWakeWord/openwakeword/train.py --training_config claudinho_config.yaml --generate_clips

print('\n‚úÖ Adversarial negatives generated')

## 4. Augment and Train

In [None]:
# Augment clips (adds noise, reverb, volume variation to all clips)
# With augmentation_rounds=10, our ~53 train recordings become ~530 unique augmented clips
!{sys.executable} openWakeWord/openwakeword/train.py --training_config claudinho_config.yaml --augment_clips

print('\n‚úÖ Augmentation complete')

In [None]:
# Train the model! This is the main training step (~15-25 min on T4 GPU)
!{sys.executable} openWakeWord/openwakeword/train.py --training_config claudinho_config.yaml --train_model

print('\n‚úÖ Training complete!')

## 5. Download the Model

In [None]:
# List every exported model artifact (.onnx first, then .tflite) with its size in KB.
model_files = [
    *glob.glob(f'{model_dir}/*.onnx'),
    *glob.glob(f'{model_dir}/*.tflite'),
]
print('Generated model files:')
for model_path in model_files:
    size_kb = os.path.getsize(model_path) / 1024
    print(f'  {model_path} ({size_kb:.1f} KB)')

# Offer the first .onnx file as a browser download (Colab only) and print the
# deployment steps; otherwise point the reader back at the training logs.
from google.colab import files
onnx_files = glob.glob(f'{model_dir}/*.onnx')
if not onnx_files:
    print('‚ùå No .onnx file found ‚Äî check training output above for errors')
else:
    first_onnx = onnx_files[0]
    files.download(first_onnx)
    print(f'\nüéâ Download started: {first_onnx}')
    for message in (
        '\nNext steps:',
        '  1. Copy to Pi: scp claudinho.onnx claudinho@claudinho.local:~/claudinho/models/',
        '  2. Update config.py: WAKE_WORD_MODEL = Path.home() / "claudinho" / "models" / "claudinho.onnx"',
        '  3. Restart service: sudo systemctl restart claudinho',
    ):
        print(message)