Voice Registration
- model used: Speechbrain 
- procedure : 
  - register the user's voice (3~6s voice is needed)
  - verify the voice
    - capture the real-time voice
    - load the voice
    - verify with scores
    - return result

In [1]:
import torch
from speechbrain.pretrained import EncoderClassifier
from scipy.spatial.distance import cosine

# loading
speaker_model = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")

# store registered speaker's embedding vector
registered_speakers = {}

def register_speaker(audio_path, speaker_name):
    """register speaker's voice"""
    # load audio
    audio_signal = torch.tensor([load_audio(audio_path)])

    embeddings = speaker_model.encode_batch(audio_signal)
    embedding_vector = embeddings.squeeze().detach().numpy()
    registered_speakers[speaker_name] = embedding_vector
    print(f"Speaker '{speaker_name}' registered successfully!")


INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
  from speechbrain.pretrained import EncoderClassifier
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'spe

In [2]:
import torchaudio

def load_audio(audio_path):
    """load audio file and convert to specific sample rate"""
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != 16000:
        resample_transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resample_transform(waveform)
    return waveform.squeeze().numpy()


In [4]:
import soundfile as sf
import tempfile
import numpy as np

def save_to_temp_wav(audio_data, sample_rate=16000):
    """
        save audio bytes stream to temporary WAV file
        Args:
            audio_data (bytes): audio data bytes stream
            sample_rate (int): audio sample rate, default 16kHz
        Returns:
            str: path of the temporary WAV file
    """
    # create a temporary file
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    temp_file.close()

    # write audio bytes stream to WAV file
    with sf.SoundFile(temp_file.name, mode="w", samplerate=sample_rate, channels=1, subtype="PCM_16") as f:
        # convert bytes stream to NumPy array
        audio_array = np.frombuffer(audio_data, dtype=np.int16)
        f.write(audio_array)
    
    return temp_file.name


In [5]:
audio_cache = []  # overall audio cache

def verify_realtime_speaker(audio_chunk, speaker_name, threshold=0.1, min_duration=3.0, adaptive=True):
    """
    real-time speaker verification, support cumulative audio and dynamic threshold adjustment
    """
    global audio_cache

    if speaker_name not in registered_speakers:
        print(f"Error: Speaker '{speaker_name}' not registered.")
        return False

    audio_cache.append(audio_chunk)
    total_audio = b"".join(audio_cache)
    duration = len(total_audio) / (16000 * 2)  # cumulative audio duration

    if duration >= min_duration:
        print("Performing speaker verification with accumulated audio...")
        try:
            audio_array = np.frombuffer(total_audio, dtype=np.int16)
            embeddings = speaker_model.encode_batch(torch.tensor([audio_array], dtype=torch.float32))
            embeddings = embeddings.squeeze().detach().numpy()
        except Exception as e:
            print(f"Error during embedding extraction: {e}")
            return False

        similarity = 1 - cosine(embeddings, registered_speakers[speaker_name])
        print(f"Similarity score with '{speaker_name}': {similarity}")

        # dynamic adjust threshold
        adjusted_threshold = threshold
        if adaptive:
            adjusted_threshold = max(threshold, similarity - 0.1)

        audio_cache = []  # clear cache
        return similarity > adjusted_threshold

    return False  # audio accumulation is insufficient


In [6]:
import soundfile as sf
import numpy as np

def save_realtime_audio(audio_cache, sample_rate=16000, output_path="/Users/7one/Documents/Work/mangoesai/livekit_paddle/realtime_audio_debug.wav"):
    """
    save real-time audio to WAV file
    Args:
        audio_cache (list): real-time recording cache, each element is bytes type
        sample_rate (int): audio sample rate, default 16kHz
        output_path (str): path of the audio file
    """
    # merge all audio blocks
    full_audio_data = b"".join(audio_cache)

    # convert bytes stream to NumPy array
    audio_array = np.frombuffer(full_audio_data, dtype=np.int16)

    # save as WAV file
    sf.write(output_path, audio_array, samplerate=sample_rate, subtype="PCM_16")
    print(f"Realtime audio saved to: {output_path}")


In [7]:
from transformers.pipelines.audio_utils import ffmpeg_microphone_live
import soundfile as sf
import numpy as np
import os
import time
from datetime import datetime

# save real-time audio
def save_realtime_audio(audio_cache, sample_rate=16000, output_path="realtime_audio_debug.wav"):
    """
    save real-time audio to WAV file
    Args:
        audio_cache (list): real-time recording cache, each element is bytes type
        sample_rate (int): audio sample rate, default 16kHz
        output_path (str): path of the audio file
    """
    # merge all audio blocks
    full_audio_data = b"".join(audio_cache)

    # convert bytes stream to NumPy array
    audio_array = np.frombuffer(full_audio_data, dtype=np.int16)

    # save as WAV file
    sf.write(output_path, audio_array, samplerate=sample_rate, subtype="PCM_16")
    print(f"Realtime audio saved to: {output_path}")


# real-time recording, verification and transcription
def transcribe_with_speaker_verification(chunk_length_s=10.0, stream_chunk_s=1.0, max_duration=30, threshold=0.4):
    mic = ffmpeg_microphone_live(
        sampling_rate=16000,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )

    print(f"Listening for speaker ...")
    audio_cache = []  # for caching audio blocks
    full_audio = b""  # for accumulating full audio
    identified_user = None  # final identified user
    start_time = time.time()  # record start time

    try:
        for i, audio_chunk in enumerate(mic):
            elapsed_time = time.time() - start_time  # calculate elapsed time
            if elapsed_time >= max_duration:
                print("Reached max duration, stopping recording.")
                break

            print(f"Processing chunk {i + 1} (elapsed time: {elapsed_time:.2f}s)...")
            raw_audio = audio_chunk["raw"]  # get audio bytes stream

            # check and convert audio block to bytes stream
            if isinstance(raw_audio, np.ndarray):
                if raw_audio.dtype == np.float32:  # if float, convert to int16
                    raw_audio = (raw_audio * 32768).astype(np.int16)
                raw_audio = raw_audio.tobytes()
            
            # cache audio block
            audio_cache.append(raw_audio)
            full_audio += raw_audio  # accumulate full audio

            # verify user's voice
            audio_array = np.frombuffer(raw_audio, dtype=np.int16)
            similarity_scores = {}
            for speaker_name, embedding_vector in registered_speakers.items():
                test_embeddings = speaker_model.encode_batch(torch.tensor([audio_array]))
                test_embeddings = test_embeddings.squeeze().detach().numpy()
                similarity = 1 - cosine(test_embeddings, embedding_vector)
                similarity_scores[speaker_name] = similarity
            
            # find the most similar user
            best_match = max(similarity_scores, key=similarity_scores.get)
            best_score = similarity_scores[best_match]
            print(f"Similarity score with '{best_match}': {best_score}")

            if best_score >= threshold:
                print(f"Voice is from '{best_match}', welcome back, Say 'Hi ZZX'!")
                identified_user = best_match
                break
            else:
                print("Voice not recognized. Continuing to listen...")

    except KeyboardInterrupt:
        print("Recording interrupted by user.")

    finally:
        # save cached audio (whether verification is successful or not)
        if audio_cache:
            output_path = "/Users/7one/Documents/Work/mangoesai/livekit_paddle/realtime_audio_debug.wav"
            save_realtime_audio(audio_cache, sample_rate=16000, output_path=output_path)

        if identified_user:
            return identified_user
        else:
            print("No speaker identified.")
            return None


register speakers, you can upload and build your user's voice as required.

In [8]:
register_speaker("/Users/7one/Documents/Work/mangoesai/livekit_paddle/99.wav", "Dengfeng")
register_speaker("/Users/7one/Documents/Work/mangoesai/livekit_paddle/amit.wav", "Amit")


  audio_signal = torch.tensor([load_audio(audio_path)])


Speaker 'Dengfeng' registered successfully!
Speaker 'Amit' registered successfully!


In [9]:
identified_user = transcribe_with_speaker_verification(
    chunk_length_s=10.0,
    stream_chunk_s=1.0,
    max_duration=30,
    threshold=0.4
)
if identified_user:
    print(f"Hello, {identified_user}!")
else:
    print("No speaker recognized.")

Listening for speaker ...




Processing chunk 1 (elapsed time: 6.37s)...
Similarity score with 'Dengfeng': 0.45401132106781006
Voice is from 'Dengfeng', welcome back, Say 'Hi ZZX'!
Realtime audio saved to: /Users/7one/Documents/Work/mangoesai/livekit_paddle/realtime_audio_debug.wav
Hello, Dengfeng!


Discussion: 
- FAISS
- try it on live-stream, if it's a new voice in the radio, register it , if it's 