This code shows how we process a given video/voice input and produce output labelled with the names of the speakers. The steps are as follows:
- Load the embedding model and store embeddings of the known voice dataset in an embedding warehouse
- Perform diarization on the target video, using pyannote.audio to split it into slices of voice
- Calculate each speaker's embedding and compare it with the known embeddings
- Generate the transcribed text and label it with the identified speakers

In [None]:
from pyannote.audio.pipelines import PretrainedSpeakerEmbedding
import torchaudio
import torch
import numpy as np

# Instantiate the pre-trained speaker embedding model.
# It runs on the GPU when CUDA is available and falls back to the CPU otherwise.
embedding_model = PretrainedSpeakerEmbedding(
    "pyannote/embedding",
    device=torch.device("cpu" if not torch.cuda.is_available() else "cuda"),
)

# Function to compute embeddings
def get_speaker_embedding(audio_path):
    """Compute the speaker embedding of a single audio file.

    The file is loaded with torchaudio, downmixed to one channel, resampled
    to the sample rate reported by ``embedding_model`` and passed to the
    model as a batch containing one waveform.

    Args:
        audio_path: Path of the audio file to embed.

    Returns:
        Whatever ``embedding_model`` returns for a one-item batch; per the
        pyannote.audio documentation this is a NumPy array of shape
        ``(1, dimension)``.
    """
    waveform, sample_rate = torchaudio.load(audio_path)

    # The embedding model only accepts single-channel audio, so average
    # the channels of stereo / multi-channel recordings.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Bring the audio to the rate the model expects; feeding it audio at a
    # different rate would silently degrade the embedding.
    target_rate = embedding_model.sample_rate
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(
            waveform, sample_rate, target_rate
        )

    # The model is called with a (batch, channel, sample) tensor, not with
    # a {'waveform': ..., 'sample_rate': ...} mapping, so add a batch axis.
    return embedding_model(waveform.unsqueeze(0))

# Build the lookup of known speakers: each name maps to the embedding
# computed from that speaker's reference recording.
known_speakers = {
    name: get_speaker_embedding(wav_path)
    for name, wav_path in (
        ("Ryan", "Ryan.wav"),
        ("Jackie", "Jackie.wav"),
        ("Rebecca", "Rebecca.wav"),
    )
}


Step 2 - Diarization (speaker segmentation)

In [None]:
from pyannote.audio.pipelines import SpeakerDiarization
from pyannote.core import Segment
import wave

# Instantiate the pre-trained diarization pipeline.
diarization_pipeline = SpeakerDiarization.from_pretrained("pyannote/speaker-diarization")

# Run the pipeline on the audio track of the target video.
audio_file = "target_video_audio.wav"
diarization_result = diarization_pipeline(audio_file)

# Flatten the diarization result into one record per track, keeping the
# speaker label together with the start and end of the turn.
speaker_segments = [
    {"speaker": label, "start": span.start, "end": span.end}
    for span, _, label in diarization_result.itertracks(yield_label=True)
]

# Save individual speaker segments as separate audio files
import torchaudio

def extract_audio_segment(input_audio, start_time, end_time, output_file):
    """Write the part of an audio file between two timestamps to a new file.

    Args:
        input_audio: Path of the audio file to read.
        start_time: Start of the excerpt; multiplied by the sample rate and
            truncated to an integer sample index.
        end_time: End of the excerpt, converted the same way (exclusive).
        output_file: Path the excerpt is saved to.
    """
    samples, rate = torchaudio.load(input_audio)
    # Convert both timestamps to sample indices along the last axis.
    first, last = (int(moment * rate) for moment in (start_time, end_time))
    excerpt = samples[:, first:last]
    torchaudio.save(output_file, excerpt, rate)

# Write every diarized segment to its own numbered WAV file so it can be
# used for speaker identification, and record the file path on the segment.
for idx, segment in enumerate(speaker_segments):
    output_path = f"speaker_segment_{idx}.wav"
    extract_audio_segment(
        input_audio=audio_file,
        start_time=segment["start"],
        end_time=segment["end"],
        output_file=output_path,
    )
    segment.update(audio_path=output_path)
