# Segmentation

In [1]:
import whisper
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Load resources
model = whisper.load_model("base").to(device)

# NOTE(review): this transcribes './eleven/11.mp3', while the Embedding cell
# crops audio from 'notebooks/11.wav' -- confirm both are the same recording,
# otherwise segment timestamps will not line up with the embedded audio.
result = model.transcribe('./eleven/11.mp3')
# Per the recorded output, each segment is a dict carrying (among others)
# 'start', 'end' and 'text' entries, which the later cells rely on.
segments = result['segments']

print(segments)

cuda


  checkpoint = torch.load(fp, map_location=device)


[{'id': 0, 'seek': 0, 'start': 0.0, 'end': 8.64, 'text': " I'll pour this pestle on his ear, so will I make the net that will entail them all.", 'tokens': [50364, 286, 603, 2016, 341, 31068, 306, 322, 702, 1273, 11, 370, 486, 286, 652, 264, 2533, 300, 486, 948, 864, 552, 439, 13, 50796], 'temperature': 0.0, 'avg_logprob': -0.3198806361148232, 'compression_ratio': 1.5233644859813085, 'no_speech_prob': 0.1153128519654274}, {'id': 1, 'seek': 0, 'start': 8.64, 'end': 12.84, 'text': " It's an adult, Yago, who says that in Othello.", 'tokens': [50796, 467, 311, 364, 5075, 11, 398, 6442, 11, 567, 1619, 300, 294, 422, 392, 11216, 13, 51006], 'temperature': 0.0, 'avg_logprob': -0.3198806361148232, 'compression_ratio': 1.5233644859813085, 'no_speech_prob': 0.1153128519654274}, {'id': 2, 'seek': 0, 'start': 12.84, 'end': 16.44, 'text': " And it's grown-ups that Machiavelli was writing about.", 'tokens': [51006, 400, 309, 311, 7709, 12, 7528, 300, 12089, 654, 303, 16320, 390, 3579, 466, 13, 51186]

# Embedding

In [21]:
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
# Speaker-embedding model, loaded from the checkpoint named below and placed
# on the torch device built from the `device` string.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb", device=torch.device(device)
)

import wave

import numpy as np

from pyannote.audio import Audio
from pyannote.core import Segment
# Audio loader configured with sample_rate=16000 and mono="downmix".
audio = Audio(sample_rate=16000, mono="downmix")

# File that segment crops are taken from.
audio_path = 'notebooks/11.wav'

# Calculate duration: number of frames divided by the frame rate, both read
# from the WAV file itself.
with wave.open(audio_path, 'r') as f:
    frame_count = f.getnframes()
    frame_rate = f.getframerate()
duration = frame_count / float(frame_rate)

# Embedding creation function
def create_segment_embedding(segment):
    """Crop one transcribed segment out of the audio file and embed it.

    Args:
        segment: Mapping with 'start' and 'end' entries, used as the crop
            boundaries within ``audio_path``.

    Returns:
        The result of calling ``embedding_model`` on the cropped waveform
        after a leading axis has been added to it.
    """
    # Whisper can report an end past the real length of the file, so the end
    # boundary is capped at the measured `duration`.
    clip = Segment(segment['start'], min(duration, segment['end']))
    waveform, _sample_rate = audio.crop(audio_path, clip)
    batched = waveform[None]
    return embedding_model(batched)

# One row per segment. The row width of 192 follows the original author's note
# that each embedding has 192 values.
n_segments = len(segments)
embeddings = np.zeros(shape=(n_segments, 192))

# Create embeddings
for row, seg in enumerate(segments):
    embeddings[row] = create_segment_embedding(seg)

# Replace any NaN entries with finite numbers before clustering.
embeddings = np.nan_to_num(embeddings)

# Clustering

In [37]:
from sklearn.cluster import AgglomerativeClustering

# With n_clusters=None the number of clusters is not fixed up front; the
# distance_threshold value decides where merging stops.
agglo = AgglomerativeClustering(n_clusters=None, distance_threshold=1150)
agglo.fit(embeddings)
labels = agglo.labels_

# Number of speakers from cluster
from collections import Counter
label_counts = Counter(labels)
print(f"Number of Distinct Clusters: {len(label_counts)}")

import datetime
def time(secs):
    """Convert a number of seconds to a ``datetime.timedelta``.

    Args:
        secs: Seconds as an int or float; rounded to a whole number with
            the built-in ``round`` before conversion.

    Returns:
        ``datetime.timedelta`` holding the rounded number of seconds.
    """
    whole_seconds = round(secs)
    return datetime.timedelta(seconds=whole_seconds)

# Tag every segment with a 1-based speaker name derived from its cluster label.
for i in range(len(segments)):
    segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)

# Write the transcript to disk and echo it to the cell output. The `with`
# block closes the file even if a segment raises part-way through, and the
# explicit UTF-8 encoding keeps non-ASCII transcript text from failing on
# platforms whose default encoding cannot represent it.
with open("notebooks/transcript.txt", "w", encoding="utf-8") as f:
    for (i, segment) in enumerate(segments):
        # Emit a "SPEAKER n h:mm:ss" header whenever the speaker changes
        # (and before the very first segment).
        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
            header = "\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n'
            print(header, end='')
            f.write(header)

        # [1:] drops the first character of the text; in the recorded output
        # every segment text starts with a single space.
        # NOTE(review): a text without that leading space would lose a letter.
        spoken = segment["text"][1:] + ' '
        f.write(spoken)
        print(spoken, end='')

Number of Distinct Clusters: 9

SPEAKER 3 0:00:00
I'll pour this pestle on his ear, so will I make the net that will entail them all. It's an adult, Yago, who says that in Othello. And it's grown-ups that Machiavelli was writing about. When he wrote the prints, his book about manipulating others and seizing power. Notice he titled the book The Prince, not The Little Prince. The Little Prince is actually by somebody else. If you don't know that. But in our American lives, the real era of intrigue and manipulation for most of us is not adulthood. It's adolescence. When our social circle is at its most, constricting. Today on our program, a story of betrayal. And of someone who holds David Kuresh like powers over others. And who is only in the seventh grade. From WB Easy in Chicago. It's your radio playhouse. I'm Ira Glass. But before we get into the body of our story, we will try as adults to manipulate you a little 
SPEAKER 1 0:01:18
bit. And put Central. 
SPEAKER 3 0:01:20
Let's check 