# Setup

In [1]:
import pandas as pd
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from pathlib import Path
import os
from tqdm import tqdm
import torch
import soundfile as sf
import numpy as np
import torchaudio

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## Embedding Pipeline (Skip if already done)

In [3]:
# control train dataset is in data/train/train.csv
# test dataset is in data/test/test.csv
# eval dataset is in data/eval/eval.csv
# real train dataset is in data/train_small/train_small.csv
# synthetic train dataset is in data/synthetic_data/synthetic_data.csv
# alternative big dataset tbd

# load all datasets
dfs = {
    #"train": pd.read_csv('data/train/train.csv'),
    #"test": pd.read_csv('data/test/test.csv'),
    #"eval": pd.read_csv('data/eval/eval.csv'),
    #"train_small": pd.read_csv('data/train_small/train_small.csv'),
    #"synthetic_data": pd.read_csv('data/synthetic_data/synthetic_data.csv')
    "iemocap": pd.read_csv('data/iemocap/iemocap.csv')
}

dirs = {
  "train": "data/train/",
  "test": "data/test/",
  "eval": "data/eval/",
  "train_small": "data/train_small/",
  "synthetic_data": "data/synthetic_data/",
  "iemocap": "data/iemocap/"
}

In [4]:
# Load pretrained model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
wav2vec = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(device)



In [5]:
def read_audio(waveforms_obj, enforce_mono=True):
    """General audio loading, based on a custom notation.

    Expected use case is in conjunction with Datasets
    specified by JSON.

    The parameter may just be a path to a file:
    `read_audio("/path/to/wav1.wav")`

    Alternatively, you can specify more options in a dict, e.g.:
    ```
    # load a file from sample 8000 through 15999
    read_audio({
        "file": "/path/to/wav2.wav",
        "start": 8000,
        "stop": 16000
    })
    ```

    Which codecs are supported depends on your torchaudio backend.
    Refer to `torchaudio.load` documentation for further details.

    Arguments
    ---------
    waveforms_obj : str, dict
        Path to audio or dict with the desired configuration.

        Keys for the dict variant:
        - `"file"` (str): Path to the audio file.
        - `"start"` (int, optional): The first sample to load.
        If unspecified, load from the very first frame.
        - `"stop"` (int, optional): The last sample to load (exclusive).
        If unspecified or equal to start, load from `start` to the end.
        Will not fail if `stop` is past the sample count of the file and will
        return less frames.
    enforce_mono : bool, optional
        If True, convert multi-channel audio to mono by averaging across channels.
        Defaults to True.

    Returns
    -------
    torch.Tensor
        1-channel: audio tensor with shape: `(samples, )`.
        >=2-channels: audio tensor with shape: `(samples, channels)`.

    Example
    -------
    >>> dummywav = torch.rand(16000)
    >>> import os
    >>> tmpfile = str(getfixture('tmpdir') / "wave.wav")
    >>> write_audio(tmpfile, dummywav, 16000)
    >>> asr_example = { "wav": tmpfile, "spk_id": "foo", "words": "foo bar"}
    >>> loaded = read_audio(asr_example["wav"])
    >>> loaded.allclose(dummywav.squeeze(0),atol=1e-4) # replace with eq with sox_io backend
    True
    """
    if isinstance(waveforms_obj, str):
        audio, _ = torchaudio.load(waveforms_obj)
    else:
        path = waveforms_obj["file"]
        start = waveforms_obj.get("start", 0)
        # To match past SB behavior, `start == stop` or omitted `stop` means to
        # load all frames from `start` to the file end.
        stop = waveforms_obj.get("stop", start)

        if start < 0:
            raise ValueError(
                f"Invalid sample range (start < 0): {start}..{stop}!"
            )

        if stop < start:
            # Could occur if the user tried one of two things:
            # - specify a negative value as an attempt to index from the end;
            # - specify -1 as an attempt to load up to the last sample.
            raise ValueError(
                f"Invalid sample range (stop < start): {start}..{stop}!\n"
                'Hint: Omit "stop" if you want to read to the end of file.'
            )

        # Requested to load until a specific frame?
        if start != stop:
            num_frames = stop - start
            audio, fs = torchaudio.load(
                path, num_frames=num_frames, frame_offset=start
            )
        else:
            # Load to the end.
            audio, fs = torchaudio.load(path, frame_offset=start)

    # Convert multi-channel audio to mono by averaging across channels if needed
    if audio.shape[0] > 1 and enforce_mono:
        audio = torch.mean(audio, dim=0, keepdim=True)

    audio = audio.transpose(0, 1)
    speech = audio.squeeze(1)

    # Resample audio to 16kHz if necessary
    _, sample_rate = sf.read(waveforms_obj)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        speech = resampler(torch.tensor(speech, dtype=torch.float32))  # Ensure float32 type
    else:
        speech = torch.tensor(speech, dtype=torch.float32)  # Ensure float32 type

    return speech

In [6]:
def get_wav2vec_embedding(input):
    # Preprocess the audio
    inputs = processor(input, sampling_rate=16000, return_tensors="pt", padding=True).to(device)

    # Extract embeddings
    with torch.no_grad():
        outputs = wav2vec(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling to get fixed-size embedding

    return embeddings

In [7]:
def process(df, dir_path="", device="cpu"):
    num_files = len(df)

    # Initialize an empty list to store embeddings
    all_embeddings = []

    for index in tqdm(range(num_files), desc="Processing files"):
        # Read and process a single audio file
        file_path = dir_path + df.loc[index, 'filename']
        input_audio = torch.tensor(read_audio(file_path), dtype=torch.float32).to(device)

        # Pass the audio to Wav2Vec model
        with torch.no_grad():
            embedding = get_wav2vec_embedding(input_audio)  # Assuming it returns a single embedding tensor

        # Append the result to the list
        all_embeddings.append(embedding.cpu().numpy())

    # Convert list of embeddings to a PyTorch tensor
    all_embeddings = torch.tensor(all_embeddings, dtype=torch.float32, device=device)

    # Normalize all embeddings (mean = 0, variance = 1) across all dimensions
    mean = all_embeddings.mean(dim=0, keepdim=True)
    std = all_embeddings.std(dim=0, keepdim=True) + 1e-8  # Avoid division by zero
    all_embeddings = (all_embeddings - mean) / std

    # Move embeddings back to CPU and write to DataFrame
    df['embedding'] = all_embeddings.cpu().numpy().tolist()

    return df

In [8]:
# for all wav files in each dataset, get the embeddings for each dataset
#normalize the mean and variance of the embeddings for each dataset so that the mean is 0 and the variance is 1

for key in dfs:
    df = dfs[key]
    dir_path = dirs[key]
    processed_df = process(df, dir_path, device)
    processed_df.to_csv(f'{key}_embeddings.csv', index=False)
    dfs[key] = processed_df


  speech = torch.tensor(speech, dtype=torch.float32)  # Ensure float32 type
  input_audio = torch.tensor(read_audio(file_path), dtype=torch.float32).to(device)
Processing files: 100%|██████████| 4597/4597 [02:06<00:00, 36.46it/s]
  all_embeddings = torch.tensor(all_embeddings, dtype=torch.float32, device=device)


## Load Embedding CSVs (if embedding pipeline already ran)

In [9]:
for key in dfs:
    dfs[key] = pd.read_csv(f'{key}_embeddings.csv')

dfs["train"]

KeyError: 'train'

# Information Theoretical Analysis

## Avoiding Estimating Probability Distributions

In [None]:
# compute MMD between the embeddings of each dataset
def gaussian_kernel(x, y, sigma=1.0):
    """
    Compute the Gaussian kernel between x and y.

    Args:
        x (np.ndarray): Array of shape (n_samples, embedding_dim).
        y (np.ndarray): Array of shape (m_samples, embedding_dim).
        sigma (float): Bandwidth of the Gaussian kernel.

    Returns:
        np.ndarray: Kernel matrix of shape (n_samples, m_samples).
    """
    pairwise_dists = cdist(x, y, 'sqeuclidean')  # Squared Euclidean distances
    return np.exp(-pairwise_dists / (2 * sigma ** 2))

def compute_mmd(X, Y, sigma=1.0):
    """
    Compute the Maximum Mean Discrepancy (MMD) between two distributions.

    Args:
        X (np.ndarray): Samples from the first distribution (n_samples, embedding_dim).
        Y (np.ndarray): Samples from the second distribution (m_samples, embedding_dim).
        sigma (float): Bandwidth of the Gaussian kernel.

    Returns:
        float: MMD^2 value.
    """
    n, m = len(X), len(Y)

    # Compute kernel matrices
    K_xx = gaussian_kernel(X, X, sigma)
    K_yy = gaussian_kernel(Y, Y, sigma)
    K_xy = gaussian_kernel(X, Y, sigma)

    # Compute MMD^2
    mmd = (
        np.sum(K_xx) / (n * (n - 1))  # Exclude diagonal for unbiased estimate
        + np.sum(K_yy) / (m * (m - 1))
        - 2 * np.sum(K_xy) / (n * m)
    )

    return mmd



In [None]:
# Kolmogorov-Smirnov test between the embeddings of each dataset

In [None]:
# Wasserstein distance between the embeddings of each dataset

## Estimating Probability Distributions

In [None]:
# Jensen-Shannon divergence between the embeddings of each dataset

In [None]:
# Bhattacharyya distance between the embeddings of each dataset