# Segmentation
In this final iteration (before I start building into a formal pipeline deliverable) we add functionality to provide a random subset of segments given $x%$ supervision.

In [1]:
import whisper
import torch


# Use CUDA if available
torch.cuda.is_available()
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Load resources
model = whisper.load_model("base").to(device)

# Create unlabeled segments
result = model.transcribe('./eleven/11.mp3')
unlabeled_segments = result['segments']

print(unlabeled_segments)

cuda


  checkpoint = torch.load(fp, map_location=device)


[{'id': 0, 'seek': 0, 'start': 0.0, 'end': 8.64, 'text': " I'll pour this pestle on his ear, so will I make the net that will entail them all.", 'tokens': [50364, 286, 603, 2016, 341, 31068, 306, 322, 702, 1273, 11, 370, 486, 286, 652, 264, 2533, 300, 486, 948, 864, 552, 439, 13, 50796], 'temperature': 0.0, 'avg_logprob': -0.3198806361148232, 'compression_ratio': 1.5233644859813085, 'no_speech_prob': 0.1153128519654274}, {'id': 1, 'seek': 0, 'start': 8.64, 'end': 12.84, 'text': " It's an adult, Yago, who says that in Othello.", 'tokens': [50796, 467, 311, 364, 5075, 11, 398, 6442, 11, 567, 1619, 300, 294, 422, 392, 11216, 13, 51006], 'temperature': 0.0, 'avg_logprob': -0.3198806361148232, 'compression_ratio': 1.5233644859813085, 'no_speech_prob': 0.1153128519654274}, {'id': 2, 'seek': 0, 'start': 12.84, 'end': 16.44, 'text': " And it's grown-ups that Machiavelli was writing about.", 'tokens': [51006, 400, 309, 311, 7709, 12, 7528, 300, 12089, 654, 303, 16320, 390, 3579, 466, 13, 51186]

In [2]:
# Create labeled segments from transcript.rttm
def parse_rttm(rttm_file):
    labeled_segments = []
    with open(rttm_file, "r") as file:
        for line in file:
            parts = line.strip().split()
            start = float(parts[3])
            duration = float(parts[4])
            end = start + duration
            speaker = parts[7]
            labeled_segments.append({"start": start, "end": end, "text": "", "speaker": speaker})
    return labeled_segments

rttm_path = 'notebooks/transcript.rttm'
labeled_segments = parse_rttm(rttm_path)

print(labeled_segments)

[{'start': 0.93, 'end': 32.51, 'text': '', 'speaker': 'ira_glass'}, {'start': 32.51, 'end': 72.55, 'text': '', 'speaker': 'ira_glass'}, {'start': 72.55, 'end': 82.69, 'text': '', 'speaker': 'ira_glass'}, {'start': 82.69, 'end': 83.89, 'text': '', 'speaker': 'shirley_jahad'}, {'start': 83.89, 'end': 84.4, 'text': '', 'speaker': 'ira_glass'}, {'start': 84.4, 'end': 88.08000000000001, 'text': '', 'speaker': 'shirley_jahad'}, {'start': 88.08, 'end': 90.63, 'text': '', 'speaker': 'ira_glass'}, {'start': 90.63, 'end': 93.72999999999999, 'text': '', 'speaker': 'shirley_jahad'}, {'start': 93.73, 'end': 95.36, 'text': '', 'speaker': 'ira_glass'}, {'start': 95.36, 'end': 96.14, 'text': '', 'speaker': 'shirley_jahad'}, {'start': 96.14, 'end': 97.37, 'text': '', 'speaker': 'ira_glass'}, {'start': 97.37, 'end': 98.37, 'text': '', 'speaker': 'shirley_jahad'}, {'start': 98.37, 'end': 104.47, 'text': '', 'speaker': 'ira_glass'}, {'start': 104.47, 'end': 109.06, 'text': '', 'speaker': 'shirley_jahad'},

In [3]:
# Given x, randomly select segments.
supervision_coeff = 0.6

total_labeled_segments = len(labeled_segments)
subset_size = int(supervision_coeff * total_labeled_segments)
subset_segments = labeled_segments[:subset_size]

# Embedding

In the semi-supervised pipeline, we need to create sets of embeddings for both the labeled and unlabeled data.

In [4]:
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device(device))

import wave

import numpy as np

from pyannote.audio import Audio
from pyannote.core import Segment

audio = Audio(sample_rate=16000, mono="downmix")
audio_path = 'notebooks/11.wav'

# Calculate duration
with wave.open(audio_path, 'r') as f:
    duration = f.getnframes() / float(f.getframerate())

# Embedding creation function
def create_segment_embedding(segment):
    start = segment['start']
    # True end of the segment, in case Whisper creates a segment beyond the duration.
    end = min(duration, segment['end'])
    block = Segment(start, end)
    waveform, sample_rate = audio.crop(audio_path, block)

    return embedding_model(waveform[None])

# Shape of outputs for each embedding is (192,) so we init np.zeros with that size.

# Create unlabeled embeddings
unlabeled_embeddings = np.zeros(shape=(len(unlabeled_segments), 192))
for i in range(len(unlabeled_segments)):
    unlabeled_embeddings[i] = create_segment_embedding(unlabeled_segments[i])
unlabeled_embeddings = np.nan_to_num(unlabeled_embeddings)


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Create labeled embeddings (only the sampled ones)
labeled_embeddings = np.zeros(shape=(len(subset_segments), 192))
labeled_labels = []

for i in range(len(subset_segments)):
    labeled_embeddings[i] = create_segment_embedding(subset_segments[i])
    labeled_labels.append(subset_segments[i]['speaker'])
labeled_embeddings = np.nan_to_num(labeled_embeddings)

labeled_labels = np.array(labeled_labels)

In [6]:
print("Unlabeled Embeddings Shape:", unlabeled_embeddings.shape)
print("Labeled (Subset) Embeddings Shape:", labeled_embeddings.shape)
print("First Unlabeled Embedding:", unlabeled_embeddings[0])
print("First Labeled Embedding with Label:", labeled_embeddings[0], labeled_labels[0])

Unlabeled Embeddings Shape: (867, 192)
Labeled (Subset) Embeddings Shape: (140, 192)
First Unlabeled Embedding: [-1.87485237e+01 -1.18506069e+01  2.73709850e+01 -3.03708954e+01
  2.35487881e+01  4.03589821e+00 -2.30557556e+01  1.04997320e+01
  1.54575624e+01  2.27257118e+01  3.26389732e+01  9.01979256e+00
 -2.08476105e+01 -1.81709175e+01  1.36648197e+01 -3.98308372e+01
 -1.19883680e+01 -4.81564194e-01 -3.67443771e+01  8.51216242e-02
  1.96071835e+01 -2.42728882e+01 -1.26174412e+01  2.61211586e+00
  3.55164032e+01  2.63367863e+01  1.57037859e+01 -1.73465424e+01
  9.25110912e+00 -3.12229271e+01 -3.65996170e+01  2.03769722e+01
 -1.63477516e+01  5.79060936e+00  2.62591801e+01 -3.29880452e+00
 -9.94142950e-01  2.84805336e+01  3.24136848e+01 -4.57150307e+01
  2.36885719e+01  1.60740242e+01 -5.98464537e+00 -5.10546827e+00
  4.95883077e-01 -2.61914959e+01 -6.04377222e+00 -3.58524203e+00
  4.51917877e+01 -3.43655128e+01  1.17791767e+01  8.19137573e+00
  8.57117081e+00 -1.87937317e+01 -4.3757705

# Clustering

In [7]:
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import AgglomerativeClustering

# With trial and error distance threshold, find initial centroids.
# agglo = AgglomerativeClustering(n_clusters=None, distance_threshold=1150).fit(unlabeled_embeddings)
agglo = AgglomerativeClustering(n_clusters=9, distance_threshold=None).fit(unlabeled_embeddings)
agglo_labels = agglo.labels_

def compute_centroids(data, labels):
    unique_labels = np.unique(labels)
    centroids = []
    for label in unique_labels:
        # Get all points belonging to the current cluster
        cluster_points = data[labels == label]
        # Compute the mean of the points in the cluster to approximate the centroid
        centroid = np.mean(cluster_points, axis=0)
        centroids.append(centroid)
    return np.array(centroids)

agglo_centroids = compute_centroids(unlabeled_embeddings, agglo_labels)

# Unlabeled are integers whereas the rttm have strings, so add conversion tables
speaker_name_to_label = {name: i for i, name in enumerate(np.unique(labeled_labels))}
label_to_speaker_name = {v: k for k, v in speaker_name_to_label.items()}

# Convert labeled labels to integers
labeled_labels_int = np.array([speaker_name_to_label[name] for name in labeled_labels])

class COPKMeans:
    def __init__(self, n_clusters, must_link=[], cannot_link=[], max_iter=300):
        self.n_clusters = n_clusters
        self.must_link = must_link
        self.cannot_link = cannot_link
        self.max_iter = max_iter

    def fit(self, X):
        # Step 1: Initialize centroids randomly
        # centroids = X[np.random.choice(X.shape[0], self.n_clusters, replace=False)]
        # centroids = self._kmeans_plus_plus_initialization(X)
        centroids = agglo_centroids
        labels = np.full(X.shape[0], -1)
        
        for _ in range(self.max_iter):
            new_labels = np.full(X.shape[0], -1)
            # new_labels = np.zeros(X.shape[0])
            
            # Step 2: Assign points to the nearest cluster considering constraints
            for i, point in enumerate(X):
                distances = euclidean_distances([point], centroids).flatten()
                
                # Sort clusters by distance
                sorted_clusters = np.argsort(distances)
                for cluster_id in sorted_clusters:
                    if self._satisfies_constraints(i, cluster_id, new_labels):
                        new_labels[i] = cluster_id
                        break
            
            # Step 3: Check for convergence
            if np.array_equal(labels, new_labels):
                break
            labels = new_labels
            
            # Step 4: Update centroids
            for cluster_id in range(self.n_clusters):
                points_in_cluster = X[labels == cluster_id]
                if len(points_in_cluster) > 0:
                    centroids[cluster_id] = np.mean(points_in_cluster, axis=0)
        
        self.labels_ = labels
        self.cluster_centers_ = centroids
    
    def _satisfies_constraints(self, point_index, cluster_id, labels):
        for (i, j) in self.must_link:
            if point_index == i and labels[j] != cluster_id:
                return False
            if point_index == j and labels[i] != cluster_id:
                return False
        
        for (i, j) in self.cannot_link:
            if point_index == i and labels[j] == cluster_id:
                return False
            if point_index == j and labels[i] == cluster_id:
                return False
        
        return True
    
    def _kmeans_plus_plus_initialization(self, X):
        unique_labels = np.unique(labeled_labels)
        n_labeled_clusters = len(unique_labels)
        assert n_labeled_clusters <= self.n_clusters

        # X is the dataset, k is the number of clusters
        centroids = []

        # Step 1: Initialize centroids with labeled data
        for label in unique_labels:
            label_cluster = labeled_embeddings[labeled_labels == label]
            centroid = np.mean(label_cluster, axis=0)
            centroids.append(centroid)

        # Step 2: Use k-means++ for the remaining centroids
        remaining_centroids_needed = self.n_clusters - n_labeled_clusters
        if remaining_centroids_needed > 0:
            # Select remaining centroids using k-means++
            distances = np.min(euclidean_distances(X, centroids), axis=1) ** 2

            for _ in range(remaining_centroids_needed):
                probabilities = distances / distances.sum()
                next_centroid_idx = np.random.choice(X.shape[0], p=probabilities)
                centroids.append(X[next_centroid_idx])

                # Update distances with the new centroid
                new_distances = euclidean_distances(X, [X[next_centroid_idx]]) ** 2
                distances = np.minimum(distances, new_distances.flatten())
        return np.array(centroids)


def generate_constraints(labeled_labels):
    must_link = []
    cannot_link = []
    
    # Generate must-link constraints
    for i in range(len(labeled_labels)):
        for j in range(i + 1, len(labeled_labels)):
            if labeled_labels[i] == labeled_labels[j]:
                must_link.append((i, j))
            else:
                cannot_link.append((i, j))
    
    return must_link, cannot_link

# First, we merge labeled and unlabeled embeddings
combined_embeddings = np.vstack((labeled_embeddings, unlabeled_embeddings))
mlink, clink = generate_constraints(labeled_labels_int)

cop_kmeans = COPKMeans(n_clusters=len(agglo_centroids), must_link=mlink, cannot_link=clink)
cop_kmeans.fit(combined_embeddings)
cluster_labels = cop_kmeans.labels_[len(labeled_embeddings):]

# Number of speakers from cluster
from collections import Counter
print(f"Number of Distinct Clusters: {len(Counter(cluster_labels))}")

Number of Distinct Clusters: 9


In [10]:
import datetime
def time(secs):
  return datetime.timedelta(seconds=round(secs))

f = open("notebooks/agglo-cop_transcript.txt", "w")

for i in range(len(unlabeled_segments)):
    # For integer id'd speakers
    unlabeled_segments[i]["speaker"] = 'SPEAKER ' + str(int(cluster_labels[i] + 1))

for (i, segment) in enumerate(unlabeled_segments):
    if i == 0 or unlabeled_segments[i - 1]["speaker"] != segment["speaker"]:
        print("\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n', end='')
        f.write("\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n')
    
    f.write(segment["text"][1:] + ' ')
    print(segment["text"][1:] + ' ', end='')
f.close()


SPEAKER 3 0:00:00
I'll pour this pestle on his ear, so will I make the net that will entail them all. It's an adult, Yago, who says that in Othello. And it's grown-ups that Machiavelli was writing about. When he wrote the prints, his book about manipulating others and seizing power. Notice he titled the book The Prince, not The Little Prince. The Little Prince is actually by somebody else. If you don't know that. But in our American lives, the real era of intrigue and manipulation for most of us is not adulthood. It's adolescence. When our social circle is at its most, constricting. Today on our program, a story of betrayal. And of someone who holds David Kuresh like powers over others. And who is only in the seventh grade. From WB Easy in Chicago. It's your radio playhouse. I'm Ira Glass. But before we get into the body of our story, we will try as adults to manipulate you a little bit. And put Central. Let's check in with Pledge Central. Shirley Jihad. 
SPEAKER 4 0:01:23
Hi, Ira Gla