In [None]:
from spatiospectral_diarization.pipeline import SpatioSpectralDiarizationPipeline
import numpy as np
import einops
from lazy_dataset.database import JsonDatabase
import paderbox as pb
import torch

# Application and modification of the diarization pipeline

## Content of this Notebook
This notebook demonstrates how to apply the `SpatioSpectralDiarizationPipeline` to a multi-channel audio signal and how to modify the pipeline for different applications, such as switching from compact to distributed microphone arrays.
Additionally, in the later section of the notebook, example code is provided on how to replace and modify individual components of the pipeline.

### Initializing the LibriWASN database as the source of the example signals

In [None]:
# Location of the LibriWASN database JSON; point this to your own copy of LibriWASN
libriwasn_json_path = 'libriwasn.json'
libriwasn = JsonDatabase(libriwasn_json_path)

# Select a single meeting by index (per the original note: a meeting from the OV40 subset of LibriWASN)
dataset_name, meeting_index = 'libriwasn200', 51
dataset = libriwasn.get_dataset(dataset_name)
example_meeting = dataset[meeting_index]

### Loading dummy signals (compact and distributed)

In [None]:
# TODO: Replace dummy code with example sessions of LibriWASN, will be exchanged in the next update
# All recordings of the meeting are looked up under the same 'observation' entry
observation_paths = example_meeting['audio_path']['observation']

# Compact 4-channel device, see the documentation of LibriWASN for more details
compact_audio_path = observation_paths['asnupb7']
audio_compact = pb.io.load_audio(compact_audio_path)

# List of smartphone devices in LibriWASN that form the distributed setup
distributed_devices = ['Pixel6a', 'Pixel6b', 'Pixel7', 'Xiaomi']
# The Xiaomi device has a flipped phase compared to the Pixel devices and requires a phase
# correction after loading. The correction is tied to the device name instead of the list
# position so that reordering `distributed_devices` cannot flip the wrong signal.
phase_inverted_devices = {'Xiaomi'}

audio_distributed = []
for device in distributed_devices:
    device_audio = pb.io.load_audio(observation_paths[device])
    if device in phase_inverted_devices:
        device_audio = -1 * device_audio
    audio_distributed.append(device_audio)

# Application of the unaltered pipeline

By default, the pipeline can be initialized and applied to a multi-channel audio signal to obtain the diarization estimate. Here, the spatial segmentation is parametrized for a compact microphone array, and a ResNet34-based embedding extractor followed by HDBSCAN clustering is used for the diarization.

In [None]:
# Build the pipeline without any arguments (i.e. with its defaults) and apply it to the
# recording of the compact array
ordinary_dia_pipeline = SpatioSpectralDiarizationPipeline()
diarization_estimate = ordinary_dia_pipeline(audio_compact)

By default, the diarization estimate is returned in the form of a dictionary containing an ArrayInterval for each speaker. ArrayIntervals provide an efficient realization of boolean arrays by only storing the start and end indices of the segments.


In [None]:
# Show the per-speaker activity stored under the 'diarization_estimate' key of the pipeline output
print(diarization_estimate['diarization_estimate'])

ArrayIntervals can be transformed back into boolean numpy arrays by slicing, i.e.

   `dia_array_spk1 = diarization_estimate['diarization_estimate'][0][:600*16_000]`

To export the diarization estimate to an .rttm format, you can use the function `pb.array.interval.to_rttm(diarization_estimate['diarization_estimate'], target_path)`

In [None]:
# Export the estimate to 'dia_estimate_compact.rttm' (relative to the current working directory)
# NOTE(review): confirm that to_rttm accepts the speaker -> ArrayInterval mapping directly
pb.array.interval.to_rttm(diarization_estimate['diarization_estimate'], 'dia_estimate_compact.rttm')

In [None]:
# Titles of the two panels that are drawn side by side
compact_titles = (
    'Diarization Estimate for Compact Microphone Array',
    'Diarization Target for the dummy signal',
)
# NOTE(review): both panels show the *estimated* activity; the panel titled
# 'Diarization Target ...' does not plot a separate reference -- confirm whether the
# target activity should be plotted there instead.
with pb.visualization.axes_context(fig_size=(10, 5), columns=2) as ax:
    compact_activity = diarization_estimate['diarization_estimate']
    for panel_title in compact_titles:
        pb.visualization.plot.activity(compact_activity, ax=ax.new, title=panel_title)

# Application in a distributed microphone setup

## Differences to the compact setup
Compared to an application in a compact microphone setup, two major differences need to be taken into account when applying the pipeline to a distributed microphone setup:
 * The audio signals need to be synchronized before applying the diarization system.
 * The TDOAs become significantly larger due to the larger inter-microphone distances, which requires a modification of the TDOA estimation and segmentation parameters.

## Synchronization

In [None]:
# The distributed signals have to be synchronized for the diarization pipeline to work correctly.
# Here, the sampling rate offsets (SROs) are compensated with Dynamic WACD via the 'paderwasn' package.

from spatiospectral_diarization.sro_compensation.sync import estimate_sros, compensate_for_sros

# First estimate the SROs from the distributed recordings, then compensate for them
sros = estimate_sros(audio_distributed)
synchronized_audios_distributed = compensate_for_sros(audio_distributed, sros)

## Application with changed segmentation parameters

In [None]:
# Parameters of the TDOA estimation, adapted to a distributed microphone setup
tdoa_settings = dict(
    # Threshold for the sum of delays over a closed microphone loop; larger differences
    # are discarded since the physically correct value is 0
    max_diff=2,
    # Maximal delay in samples that can be tracked by the TDOA estimation
    search_range=200,
    # Minimum frequency for the TDOA estimation
    f_min=125,
    # Maximum frequency for the TDOA estimation (higher frequencies become
    # uninformative in a distributed setup)
    f_max=3500,
    # Marks the TDOA estimation as running in a distributed setup; only needed if only
    # spatial diarization is performed
    distributed=True,
)

# Parameters of the spatial segmentation, adapted to a distributed microphone setup
segmentation_settings = dict(
    # Maximal difference in samples of TDOA vectors to be considered from the same segment
    max_dist=2,
    # Peak-ratio threshold (its exact meaning is not documented in this notebook)
    peak_ratio_th=.5,
    # Maximal temporal distance between two TDOA vectors such that they can belong to
    # the same segment
    max_temp_dist=16,
)

# Build the pipeline with the modified settings and apply it to the synchronized signals
distributed_pipeline = SpatioSpectralDiarizationPipeline(
    tdoa_settings=tdoa_settings,
    segmentation_settings=segmentation_settings,
)
diarization_estimate = distributed_pipeline(synchronized_audios_distributed)


In [None]:
# Titles of the two panels that are drawn side by side
distributed_titles = (
    'Diarization Estimate for distributed Microphone Array',
    'Diarization Target for the dummy signal',
)
# NOTE(review): both panels show the *estimated* activity; the panel titled
# 'Diarization Target ...' does not plot a separate reference -- confirm whether the
# target activity should be plotted there instead.
with pb.visualization.axes_context(fig_size=(10, 5), columns=2) as ax:
    distributed_activity = diarization_estimate['diarization_estimate']
    for panel_title in distributed_titles:
        pb.visualization.plot.activity(distributed_activity, ax=ax.new, title=panel_title)

# Exchanging Modules of the Pipeline

By default, all components of the pipeline can be exchanged without any large overhead. However, since the core aspects lie in the spatial segmentation and segment-level beamforming, the typical use-case should lie in exchanging the VAD, the embedding extractor, or the clustering stage.

These modules are structured such that the pipeline class expects a callable object (e.g. a class instance or a function) for each of these components.

In the following, we provide some mock implementations showing what a modified VAD, embedding extractor, or clustering module might look like.

### Example VAD module

In [None]:
def my_new_vad(recording):
    """
    A mock VAD module that returns a channel-wise, sample-wise energy-based VAD decision.

    Each sample is marked as active if its instantaneous power exceeds 50% of the
    average power of its channel.

    Args:
        recording (np.ndarray): The input audio recording with shape (num_channels, num_samples).

    Returns:
        vad_estimate (np.ndarray): A boolean array of shape (num_channels, num_samples) indicating the VAD decision for each sample.
    """
    recording = np.asarray(recording)

    # Very simplistic energy-based VAD: the instantaneous power of every sample ...
    sample_power = recording ** 2
    # ... is compared against 50% of the average power of the respective channel.
    # keepdims=True keeps one threshold per channel that broadcasts over the samples.
    threshold = 0.5 * np.mean(sample_power, axis=-1, keepdims=True)

    # The comparison directly yields a boolean array, so the result does not inherit
    # the (floating point) dtype of the recording.
    vad_estimate = sample_power > threshold
    return vad_estimate

### Example Embedding Extractor

In [None]:
import torch
from speechbrain.inference.speaker import EncoderClassifier

class MyNewEmbeddingExtractor:
    """
    A mock embedding extractor that outputs a speaker embedding from a single audio segment in time domain. In this case, this class simply wraps the call instructions of the ECAPA-TDNN from speechbrain, see https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb
    """
    def __init__(self):
        """
        Simply initializes the pre-trained ECAPA-TDNN model from speechbrain.
        """
        self.embedding_extractor = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")

    def __call__(self, audio_segment):
        """
        Extracts the speaker embedding for the input audio segment.
        Args:
            audio_segment (np.ndarray): 1-dimensional audio segment with shape (num_samples,).

        Returns:
            embedding (np.ndarray): E-dimensional speaker embedding for the given audio segment, shape (E,).
        """
        # Convert the numpy array to a float32 torch tensor
        recording_tensor = torch.tensor(audio_segment, dtype=torch.float32)
        # Extract the embeddings using the speechbrain model
        with torch.no_grad():
            embedding = self.embedding_extractor.encode_batch(recording_tensor)
            # The model output carries leading batch-like axes in front of the embedding
            # axis (presumably (1, 1, E) for a single segment -- TODO confirm against
            # speechbrain). Averaging over all leading axes yields the documented
            # (E,) shape instead of leaving a singleton axis behind.
            embedding = embedding.reshape(-1, embedding.shape[-1]).mean(dim=0)
        # Move to host memory before converting back to a numpy array
        embedding = embedding.detach().cpu().numpy()
        return embedding

### Example Clustering Module

In [None]:
def my_new_clustering(embeddings, seg_boundaries):
    """
    Mock clustering stage that assigns a randomly drawn cluster label to every segment.

    Args:
        embeddings (np.ndarray): Speaker embeddings of all segments, shape (num_segments, embedding_dim).
        seg_boundaries (list): The (start, end) boundaries of all segments. A real clustering could use them
            jointly with the embeddings, e.g. to weight embeddings or to discard too short segments.

    Returns:
        labels (np.ndarray): Cluster label of each segment, shape (num_segments,).
        segment_boundaries_new (list): The segment boundaries after clustering; returned so that a clustering
            may discard or modify segments.
        embeddings_new (np.ndarray): The embeddings after clustering; returned so that a clustering may
            discard or modify segments.
    """
    # Dummy "clustering": draw one label per segment from a fixed number of clusters
    cluster_count = 8
    segment_count = embeddings.shape[0]
    random_labels = np.random.randint(low=0, high=cluster_count, size=segment_count)

    # No segment is discarded or manipulated, so the inputs are passed through unchanged
    return random_labels, seg_boundaries, embeddings

### Constructing and applying the new custom pipeline

In [None]:
# Assemble a pipeline in which the VAD, the embedding extractor and the clustering stage
# are replaced by the mock components defined in the cells above
custom_pipeline = SpatioSpectralDiarizationPipeline(
    vad_module=my_new_vad,
    embedding_extractor=MyNewEmbeddingExtractor(),
    clustering=my_new_clustering,  # or any other clustering method
)
# Apply the customized pipeline to the recording of the compact array
custom_diarization_estimate = custom_pipeline(audio_compact)