In [1]:
# Clone repository deep-speaker
!git clone https://github.com/philipperemy/deep-speaker.git
%cd deep-speaker

# Install dependencies
!pip install -r requirements.txt
!pip install librosa soundfile scipy scikit-learn gdown

Cloning into 'deep-speaker'...
remote: Enumerating objects: 2114, done.[K
remote: Counting objects: 100% (227/227), done.[K
remote: Compressing objects: 100% (112/112), done.[K
remote: Total 2114 (delta 113), reused 187 (delta 104), pack-reused 1887 (from 1)[K
Receiving objects: 100% (2114/2114), 81.51 MiB | 47.15 MiB/s, done.
Resolving deltas: 100% (1102/1102), done.
/content/deep-speaker
Collecting python_speech_features>=0.6 (from -r requirements.txt (line 4))
  Downloading python_speech_features-0.6.tar.gz (5.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: python_speech_features
  Building wheel for python_speech_features (setup.py) ... [?25l[?25hdone
  Created wheel for python_speech_features: filename=python_speech_features-0.6-py3-none-any.whl size=5868 sha256=da93bc46d89be6e32ea36b956100740a517deed3775145a9ffa7a54a45edbbe0
  Stored in directory: /root/.cache/pip/wheels/60/90/3c/4b5996a95d363fa14525597a19146a940bec467b44b2

In [2]:
# Patch the cloned source in place: every `regularizers.l2(l=` in
# deep_speaker/conv_models.py is rewritten to `regularizers.l2(l2=`.
# NOTE(review): presumably needed because the installed Keras names this
# keyword argument `l2` instead of `l` — confirm against the Keras version in use.
!sed -i 's/regularizers.l2(l=/regularizers.l2(l2=/g' deep_speaker/conv_models.py

In [22]:
!pip install --upgrade ml_dtypes



In [23]:
import sys
import os
import random
import numpy as np
import pandas as pd
import gdown
from scipy.spatial.distance import cosine
from IPython.display import Audio, display

# Imports from deep_speaker
from deep_speaker.audio import read_mfcc
from deep_speaker.batcher import sample_from_mfcc
from deep_speaker.constants import SAMPLE_RATE, NUM_FRAMES
from deep_speaker.conv_models import DeepSpeakerModel

# Show the audio constants imported from deep_speaker.constants; both are
# passed to the feature-extraction helpers when computing embeddings.
print(f"SAMPLE_RATE: {SAMPLE_RATE} Hz")
print(f"NUM_FRAMES: {NUM_FRAMES} frames")

SAMPLE_RATE: 16000 Hz
NUM_FRAMES: 160 frames


In [31]:
# Download (once) and load the pretrained model weights.
file_id = '1F9NvdrarWZNktdX9KlRYWWHDwRkip_aP'
output_filename = 'ResCNN_triplet_training_checkpoint_265.h5'

# Skip the download when the checkpoint is already on disk, so re-running
# this cell is cheap.
if not os.path.exists(output_filename):
    downloaded_path = gdown.download(id=file_id, output=output_filename, quiet=False)
    # Fail here with a clear message instead of letting load_weights report
    # a missing file further down.
    if downloaded_path is None or not os.path.exists(output_filename):
        raise RuntimeError(
            f"Failed to download checkpoint {output_filename!r} "
            f"(Google Drive id {file_id})"
        )

# `model` is a notebook-level global; the embedding helper reads `model.m`.
model = DeepSpeakerModel()
# by_name=True is forwarded to the Keras load_weights call
# (presumably matches weights to layers by layer name — see the Keras docs).
model.m.load_weights(output_filename, by_name=True)
print("Model loaded")

Model loaded


In [32]:
def get_voice_embedding(filename):
    """Return the flattened speaker embedding predicted for one audio file.

    The file is converted to MFCC features with ``read_mfcc`` at
    ``SAMPLE_RATE``, reduced to ``NUM_FRAMES`` frames by ``sample_from_mfcc``,
    given a leading batch axis, and passed through the global ``model``.

    Args:
        filename: Path of the audio file to embed.

    Returns:
        The model prediction flattened to a 1-D array (the original docstring
        describes it as 512-dimensional).
    """
    features = sample_from_mfcc(read_mfcc(filename, SAMPLE_RATE), NUM_FRAMES)
    # The model predicts on batches, so wrap the single sample in a batch of one.
    batch = np.expand_dims(features, 0)
    return model.m.predict(batch, verbose=0).flatten()

def verify_speaker(file1, file2, threshold=0.50):
    """Decide whether two audio files come from the same speaker.

    Args:
        file1: Path of the first audio file.
        file2: Path of the second audio file.
        threshold: Minimum similarity for the pair to count as the same speaker.

    Returns:
        A ``(similarity, conclusion)`` tuple. ``similarity`` is one minus the
        cosine distance between the two embeddings; ``conclusion`` is
        "Speaker SAMA" when ``similarity >= threshold``, otherwise
        "Speaker BERBEDA".
    """
    # Embed file1 first, then file2 (same evaluation order as two separate calls).
    first_embedding, second_embedding = (
        get_voice_embedding(audio_file) for audio_file in (file1, file2)
    )
    similarity = 1 - cosine(first_embedding, second_embedding)
    if similarity >= threshold:
        conclusion = "Speaker SAMA"
    else:
        conclusion = "Speaker BERBEDA"
    return similarity, conclusion

In [34]:
# Display and play the audio samples.
# The sample paths are defined in this cell so that it also works on a fresh
# kernel (Restart & Run All); it previously relied on `audio_samples` being
# left over from a cell that appears later in the notebook. Keep this mapping
# identical to the one used by the speaker-recognition test cell.
audio_samples = {
    "A1 (Philippe)": 'samples/PhilippeRemy/PhilippeRemy_003.wav',
    "A2 (Philippe)": 'samples/PhilippeRemy/PhilippeRemy_004.wav',
    "B1 (Unknown)": 'samples/1255-90413-0001.flac'
}

print("Samples Preview:")
print("=" * 50)

for name, path in audio_samples.items():
    if os.path.exists(path):
        print(f"\n{name}:")
        display(Audio(path))
    else:
        # Report missing files instead of skipping them silently.
        print(f"\n{name}: file not found ({path})")

Samples Preview:

A1 (Philippe):



A2 (Philippe):



B1 (Unknown):


In [29]:
# Cell 7: Speaker recognition test
# Audio samples shipped with the cloned repository (paths are relative to the
# repository root, which is the working directory).
audio_samples = {
    "A1 (Philippe)": 'samples/PhilippeRemy/PhilippeRemy_003.wav',
    "A2 (Philippe)": 'samples/PhilippeRemy/PhilippeRemy_004.wav',
    "B1 (Unknown)": 'samples/1255-90413-0001.flac'
}

# Test pairs as (label, first file, second file): one same-speaker pair and
# one different-speaker pair.
tests = [
    ("Test: Speaker Sama", audio_samples["A1 (Philippe)"], audio_samples["A2 (Philippe)"]),
    ("Test: Speaker Berbeda", audio_samples["A1 (Philippe)"], audio_samples["B1 (Unknown)"])
]

# Decision threshold passed to verify_speaker: a similarity at or above this
# value is reported as the same speaker.
THRESHOLD = 0.50

# Report header, including the printed rationale for the chosen threshold.
# NOTE(review): the "typical" similarity ranges quoted in the printed text are
# not derived anywhere in this notebook — confirm against measured data.
print("=" * 70)
print("HASIL PERBANDINGAN COSINE SIMILARITY")
print("=" * 70)
print(f"\nThreshold: {THRESHOLD}")
print("Alasan: Threshold 0.5 dipilih karena berada di titik tengah antara")
print("        typical same-speaker (~0.7-0.9) dan different-speaker (~0.1-0.3)")
print("        sehingga memberikan margin keamanan yang cukup.")
print("\n" + "-" * 70)
print(f"{'Pasangan Audio':<30} | {'Similarity':<12} | {'Kesimpulan'}")
print("-" * 70)

# One table row per test pair: label, similarity to 4 decimals, conclusion.
for label, file1, file2 in tests:
    score, result = verify_speaker(file1, file2, THRESHOLD)
    print(f"{label:<30} | {score:<12.4f} | {result}")

print("=" * 70)

HASIL PERBANDINGAN COSINE SIMILARITY

Threshold: 0.5
Alasan: Threshold 0.5 dipilih karena berada di titik tengah antara
        typical same-speaker (~0.7-0.9) dan different-speaker (~0.1-0.3)
        sehingga memberikan margin keamanan yang cukup.

----------------------------------------------------------------------
Pasangan Audio                 | Similarity   | Kesimpulan
----------------------------------------------------------------------
Test: Speaker Sama             | 0.8608       | Speaker SAMA
Test: Speaker Berbeda          | 0.0087       | Speaker BERBEDA
