# Segmentation

In [3]:
import whisper
import torch

# Select the compute device once: CUDA when torch reports it, otherwise CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Load resources
# The "base" Whisper checkpoint, moved onto the selected device.
model = whisper.load_model("base").to(device)

# Transcribe the recording. Later cells read the 'start', 'end' and 'text'
# keys of every entry in result['segments'].
# NOTE(review): transcription reads './eleven/11.mp3' while the embedding cell
# crops 'notebooks/11.wav' -- confirm both files hold the same recording,
# otherwise the segment timestamps will not line up with the cropped audio.
result = model.transcribe('./eleven/11.mp3')
segments = result['segments']

print(segments)

# Embedding

In [None]:
import wave

import numpy as np
import pyannote.audio
from pyannote.audio import Audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.core import Segment

# Pretrained speaker-embedding model, placed on the same device chosen for Whisper.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device(device),
)

# Audio reader used to crop segments out of the recording.
audio = Audio(sample_rate=16000, mono="downmix")

audio_path = 'notebooks/11.wav'

# Calculate duration: number of frames divided by frames per second.
with wave.open(audio_path, 'r') as wav_file:
    frame_count = wav_file.getnframes()
    frame_rate = wav_file.getframerate()
duration = frame_count / float(frame_rate)

# Embedding creation function
def create_segment_embedding(segment):
    """Compute the speaker embedding for a single transcript segment.

    Args:
        segment: mapping with 'start' and 'end' entries delimiting the span
            to crop from ``audio_path``.

    Returns:
        Whatever ``embedding_model`` returns for the cropped waveform.
    """
    # Whisper may report an end past the real length of the file, so the end
    # is capped at the measured duration.
    clip = Segment(segment['start'], min(duration, segment['end']))
    # crop() also returns the sample rate, which is not needed here.
    waveform, _ = audio.crop(audio_path, clip)

    # Indexing with None prepends one axis before the model is called.
    return embedding_model(waveform[None])

# Every embedding is expected to hold 192 values, so the result matrix is
# pre-allocated as zeros with one 192-wide row per segment.

# Create embeddings
embeddings = np.zeros(shape=(len(segments), 192))
for row, segment in enumerate(segments):
    embeddings[row] = create_segment_embedding(segment)
# Replace any NaN entries with finite numbers before clustering.
embeddings = np.nan_to_num(embeddings)

  from .autonotebook import tqdm as notebook_tqdm


# Clustering

In [None]:
from collections import Counter

from sklearn.cluster import AgglomerativeClustering

# No fixed cluster count is given (n_clusters=None); the distance threshold
# is what determines how many clusters come out.
agglo = AgglomerativeClustering(distance_threshold=1150, n_clusters=None)
agglo.fit(embeddings)
labels = agglo.labels_

# Number of speakers from cluster
distinct_clusters = Counter(labels)
print(f"Number of Distinct Clusters: {len(distinct_clusters)}")

import datetime
def time(secs):
    """Convert a number of seconds into a ``datetime.timedelta``.

    The value is rounded to the nearest whole second first, so the string
    form of the result carries no fractional part.
    """
    whole_seconds = round(secs)
    return datetime.timedelta(seconds=whole_seconds)

# Tag every segment with a 1-based speaker name derived from its cluster label.
for segment, label in zip(segments, labels):
    segment["speaker"] = 'SPEAKER ' + str(label + 1)

# Write the transcript to disk and echo the same text to the cell output.
# The context manager closes the file even if a segment raises mid-loop, and
# the explicit encoding keeps non-ASCII transcript text from depending on the
# platform default.
with open("notebooks/base_transcript.txt", "w", encoding="utf-8") as transcript_file:
    previous_speaker = None
    for segment in segments:
        # Emit a "SPEAKER n h:mm:ss" header whenever the speaker changes
        # (and before the very first segment).
        if segment["speaker"] != previous_speaker:
            header = "\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n'
            print(header, end='')
            transcript_file.write(header)
        previous_speaker = segment["speaker"]

        # Strip leading whitespace instead of blindly dropping the first
        # character, so text that does not start with a space keeps its
        # first letter.
        text = segment["text"].lstrip() + ' '
        transcript_file.write(text)
        print(text, end='')

NameError: name 'embeddings' is not defined