# Setup

In [1]:
!git clone https://github.com/ewsiegel/small-audio-data-aug.git

Cloning into 'small-audio-data-aug'...
remote: Enumerating objects: 939, done.[K
remote: Counting objects: 100% (97/97), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 939 (delta 40), reused 96 (delta 40), pack-reused 842 (from 1)[K
Receiving objects: 100% (939/939), 94.81 MiB | 6.42 MiB/s, done.
Resolving deltas: 100% (108/108), done.
Updating files: 100% (727/727), done.
Filtering content: 100% (2/2), 322.80 MiB | 31.16 MiB/s, done.


In [2]:
!pip install POT

Collecting POT
  Downloading POT-0.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (34 kB)
Downloading POT-0.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (865 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/865.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.4/865.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.6/865.6 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: POT
Successfully installed POT-0.9.5


In [3]:
import pandas as pd
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from pathlib import Path
import os
from tqdm import tqdm
import torch
import soundfile as sf
import numpy as np
import torchaudio
import ot

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [5]:
dfs = {
    "train": None,
    "test": None,
    "eval": None,
    "train_small": None,
    "synthetic_data": None,
    "iemocap": None
}

## Load Embedding CSVs (if embedding pipeline already ran)

In [6]:
def convert_embedding_to_array(embedding_str):
    return np.array([[float(x) for x in embedding_str[2:-2].split(',')]])

In [7]:
for key in dfs:
    dfs[key] = pd.read_csv(f'small-audio-data-aug/{key}_embeddings.csv')
    dfs[key]['embedding'] = dfs[key]['embedding'].apply(convert_embedding_to_array)
    dfs[key]['embedding_size'] = dfs[key]['embedding'].apply(lambda x: x.shape)
    dfs[key] = dfs[key].assign(
    max_value = dfs[key]['embedding'].apply(lambda x: np.max(x)),
    min_value = dfs[key]['embedding'].apply(lambda x: np.min(x))
    )

    if key != "iemocap":
      dfs[key].drop(["Dialogue_ID", "Utterance_ID"], axis=1, inplace=True)

In [None]:
dfs['train']

Unnamed: 0,Emotion,Utterance,filename,embedding,embedding_size,max_value,min_value
0,anger,"Oh no-no-no, give me some specifics.",dia6_utt5.wav,"[[0.44665947556495667, -1.11380934715271, 1.62...","(1, 768)",2.468270,-2.459289
1,anger,You fell asleep!!,dia6_utt9.wav,"[[-0.7317129969596863, 1.0562498569488525, -1....","(1, 768)",4.007577,-3.071571
2,anger,There was no kangaroo!,dia6_utt10.wav,"[[-1.2240828275680542, 0.29620447754859924, -0...","(1, 768)",2.496515,-3.160261
3,anger,They didnt take any of my suggestions!,dia6_utt11.wav,"[[0.5159741640090942, -0.4054679572582245, 0.9...","(1, 768)",2.592684,-2.452389
4,anger,This guy fell asleep!,dia6_utt18.wav,"[[-1.4163166284561157, 1.530881643295288, 0.11...","(1, 768)",3.198749,-2.332248
...,...,...,...,...,...,...,...
9444,surprise,You're kidding right?,dia1036_utt4.wav,"[[0.46727198362350464, -0.615651547908783, 0.9...","(1, 768)",3.078316,-2.628455
9445,surprise,What?!,dia1036_utt6.wav,"[[0.7963269948959351, -1.2791682481765747, 0.3...","(1, 768)",2.263556,-2.662967
9446,surprise,"Dont tell me, because of the big nostril thing?",dia1038_utt4.wav,"[[-0.8200936317443848, 1.1993712186813354, -0....","(1, 768)",1.997519,-2.180758
9447,surprise,How many perfectly fine women are you gonna re...,dia1038_utt8.wav,"[[-1.1827545166015625, -0.12241476029157639, 1...","(1, 768)",1.859099,-2.206651


In [None]:
dfs['test']

Unnamed: 0,Emotion,Utterance,filename,embedding,embedding_size,max_value,min_value
0,anger,Oh. Thats so Monica can keep track. That way ...,dia0_utt1.wav,"[[-0.5085744261741638, -0.8337048292160034, 1....","(1, 768)",1.960479,-2.304053
1,anger,"Okay, y'know what? There is no more left, left!",dia2_utt4.wav,"[[0.3926408588886261, -0.05954478308558464, -1...","(1, 768)",2.344693,-2.355750
2,anger,"Oh okay, lift it straight up over your head!",dia2_utt5.wav,"[[-0.6634398102760315, -0.5530644655227661, -0...","(1, 768)",2.754818,-1.969784
3,anger,Straight up over your head!,final_videos_testdia2_utt6.wav,"[[-0.39111119508743286, 0.8370274901390076, -0...","(1, 768)",2.283523,-2.997050
4,anger,"Okay, fine, whatever. Welcome to the building.",dia3_utt4.wav,"[[-0.5120477676391602, -0.2375807911157608, -0...","(1, 768)",1.949899,-1.640827
...,...,...,...,...,...,...,...
2487,surprise,"Ah, he cant make it, he said he had to his......",dia273_utt1.wav,"[[0.6856644153594971, 0.5390406847000122, -0.6...","(1, 768)",2.605439,-2.524296
2488,surprise,Oh Willies still alive!,dia278_utt1.wav,"[[2.2445993423461914, -0.661720335483551, 1.61...","(1, 768)",4.956912,-4.582806
2489,surprise,"Thats not true, there are great pictures of us!",dia278_utt5.wav,"[[0.29597780108451843, -0.1121884360909462, 0....","(1, 768)",2.419948,-2.256171
2490,surprise,Oh my God! Thats the creep that youre with a...,dia278_utt7.wav,"[[-1.0072144269943237, 0.016898762434720993, 0...","(1, 768)",3.132073,-2.230540


In [None]:
dfs['eval']

Unnamed: 0,Emotion,Utterance,filename,embedding,embedding_size,max_value,min_value
0,anger,"I-Im sorry, but the moment I touch him, I jus...",dia1_utt11.wav,"[[-0.3840460181236267, -0.7538995742797852, 0....","(1, 768)",1.912862,-1.978784
1,anger,"Ah AhGet out of here! Uh, meeting someone? ...",dia4_utt6.wav,"[[0.7532823085784912, -1.0384068489074707, 0.9...","(1, 768)",3.519065,-2.854726
2,anger,You had no right to tell me you ever had feeli...,dia5_utt0.wav,"[[-1.6826235055923462, 0.17885807156562805, -0...","(1, 768)",1.838689,-1.910871
3,anger,I was doing great with Julie before I found ou...,dia5_utt2.wav,"[[-1.445603370666504, 0.12328189611434937, 0.2...","(1, 768)",1.877653,-1.975783
4,anger,"Hey, I was doin' great before I found out abou...",dia5_utt3.wav,"[[-1.8289591073989868, 0.3589648902416229, 0.2...","(1, 768)",2.583952,-2.251678
...,...,...,...,...,...,...,...
1041,surprise,"Ugh, how can you even ask that question?!",dia107_utt3.wav,"[[-1.272656798362732, 1.3542582988739014, -0.7...","(1, 768)",2.927048,-3.036491
1042,surprise,Ross! Get a shot of this. (He's carrying an is...,dia108_utt0.wav,"[[0.6863281726837158, -0.33641546964645386, 0....","(1, 768)",2.959264,-3.362324
1043,surprise,"This one doesn't hurt eitherOoh, yes it does!",dia108_utt2.wav,"[[-0.06057782843708992, 0.689980149269104, -0....","(1, 768)",2.471241,-2.452653
1044,surprise,Ooh!,dia108_utt7.wav,"[[-0.9258497357368469, -0.07488519698381424, 0...","(1, 768)",5.568173,-4.057945


In [None]:
dfs['train_small']

Unnamed: 0,Emotion,Utterance,filename,embedding,embedding_size,max_value,min_value
0,neutral,What does she mean?,dia546_utt6.wav,"[[0.5915156006813049, -0.7444477677345276, -0....","(1, 768)",2.321511,-3.017940
1,joy,"Yes, but theres two in martini, soo everybody",dia845_utt5.wav,"[[-0.3148207366466522, -0.26443731784820557, 0...","(1, 768)",2.082715,-2.184872
2,sadness,I realize that people are going to be disappoi...,dia996_utt6.wav,"[[-0.8711427450180054, -0.41028016805648804, -...","(1, 768)",1.882271,-2.116921
3,joy,"Hi, I'm Ben. I'm hospital worker Ben. It's Ben...",dia639_utt1.wav,"[[0.35737714171409607, -1.334572672843933, 1.6...","(1, 768)",2.724236,-3.484365
4,anger,Why cant you get a girlfriend your own age?,dia30_utt6.wav,"[[0.29854339361190796, 0.9793126583099365, -0....","(1, 768)",2.932210,-2.765011
...,...,...,...,...,...,...,...
2348,anger,"No, I didn't misunderstand, okay? She was all ...",dia1026_utt13.wav,"[[-1.8772238492965698, 0.2824714183807373, -0....","(1, 768)",1.711957,-1.877224
2349,surprise,"So you might say, the ring is irreplaceable?",dia623_utt16.wav,"[[-1.84028160572052, 0.6637547612190247, -0.48...","(1, 768)",2.464344,-2.068386
2350,neutral,Are you all right?,dia939_utt2.wav,"[[0.3034297227859497, 0.17507365345954895, 0.0...","(1, 768)",3.112333,-2.545152
2351,surprise,There are?,dia163_utt2.wav,"[[1.4636329412460327, 0.7037951946258545, 0.39...","(1, 768)",3.690302,-4.004744


In [None]:
dfs['synthetic_data']

Unnamed: 0,Emotion,Utterance,filename,embedding,embedding_size,max_value,min_value
0,surprise,,synthetic_23d2f8d2-868a-4b6b-9fe7-d3bb2eb8be04...,"[[-0.011522575281560421, -2.038097620010376, -...","(1, 768)",3.498626,-3.085043
1,surprise,,synthetic_ef144e24-a5a0-427d-bb31-c0784050aa38...,"[[0.1717299073934555, 1.1432627439498901, 0.00...","(1, 768)",2.012098,-2.012697
2,surprise,,synthetic_b3abaf3c-bd69-4d7e-815e-5315a5b0c662...,"[[-0.8839215040206909, -1.9413646459579468, 0....","(1, 768)",2.181602,-2.371408
3,surprise,,synthetic_55d22bf8-95a5-4486-929b-805672667ad9...,"[[0.10746016353368759, -0.009100914932787418, ...","(1, 768)",3.655480,-3.715517
4,surprise,,synthetic_4c438864-e0f9-4e07-a6c4-5b5d0661bb1b...,"[[-0.9252136945724487, 0.15469923615455627, 0....","(1, 768)",1.924655,-1.720954
...,...,...,...,...,...,...,...
11760,sadness,,synthetic_fdd52bd1-2746-41a4-9499-e3863d4cbf6e...,"[[-0.4149121344089508, -0.01586376689374447, 0...","(1, 768)",1.605808,-1.388799
11761,sadness,,synthetic_adea23d8-c7f8-48f9-8e8e-05cdc8a7d95e...,"[[0.27845144271850586, -1.073457956314087, -0....","(1, 768)",2.139312,-2.177377
11762,sadness,,synthetic_99f45d17-9bcc-4590-bcde-52604dc907d4...,"[[-0.4005843997001648, 0.3803749084472656, -0....","(1, 768)",1.949383,-2.041683
11763,sadness,,synthetic_e567d932-a202-409d-8f21-ea22c2b216ed...,"[[-0.3698214590549469, 0.2207428365945816, -0....","(1, 768)",2.419673,-2.124777


In [None]:
dfs['iemocap']

Unnamed: 0,filename,Emotion,embedding,embedding_size,max_value,min_value
0,Session1/sentences/wav/Ses01F_script02_1/Ses01...,neutral,"[[0.982848048210144, -0.7211358547210693, 0.32...","(1, 768)",3.167983,-2.882839
1,Session1/sentences/wav/Ses01F_script02_1/Ses01...,surprised,"[[0.21380597352981567, 1.0312399864196777, -2....","(1, 768)",2.237600,-2.443452
2,Session1/sentences/wav/Ses01F_script02_1/Ses01...,neutral,"[[0.30551090836524963, 0.47443634271621704, -0...","(1, 768)",2.385293,-2.942109
3,Session1/sentences/wav/Ses01F_script02_1/Ses01...,anger,"[[-0.681359589099884, 1.39510977268219, 0.5641...","(1, 768)",2.123069,-2.297904
4,Session1/sentences/wav/Ses01F_script02_1/Ses01...,anger,"[[-1.189020037651062, 0.38139694929122925, 0.0...","(1, 768)",2.575806,-2.271996
...,...,...,...,...,...,...
4592,Session5/sentences/wav/Ses05F_impro06/Ses05F_i...,neutral,"[[0.916408896446228, -0.7958894968032837, 0.33...","(1, 768)",5.105549,-3.431727
4593,Session5/sentences/wav/Ses05F_impro06/Ses05F_i...,neutral,"[[1.066419005393982, -0.5263048410415649, 0.11...","(1, 768)",2.077878,-2.229864
4594,Session5/sentences/wav/Ses05F_impro06/Ses05F_i...,neutral,"[[1.92062246799469, -1.2376164197921753, 0.159...","(1, 768)",2.647804,-2.506254
4595,Session5/sentences/wav/Ses05F_impro06/Ses05F_i...,neutral,"[[2.3236637115478516, -1.648442029953003, 0.63...","(1, 768)",3.086173,-3.184543


In [None]:
for key in dfs:
  print(f"{key} min value : {min(dfs[key]['min_value'])}")
  print(f"{key} max value : {max(dfs[key]['max_value'])}")
  print(f"{key} variance : {np.array()}")

train min value : -11.237277030944824
train max value : 9.692025184631348
test min value : -9.51072883605957
test max value : 8.793691635131836
eval min value : -8.543774604797363
eval max value : 8.501014709472656
train_small min value : -9.685197830200195
train_small max value : 9.451836585998535
synthetic_data min value : -10.237051963806152
synthetic_data max value : 10.832383155822754
iemocap min value : -10.985356330871582
iemocap max value : 10.823680877685547


Euclidean Distance is acceptable because all data is within similar bounds (-11,11)

# Information Theoretical Analysis

## Avoiding Estimating Probability Distributions

In [8]:
# compute MMD between the embeddings of each dataset
def gaussian_kernel(x, y, sigma=1.0):
    """
    Compute the Gaussian kernel between x and y.

    Args:
        x (np.ndarray): Array of shape (n_samples, embedding_dim).
        y (np.ndarray): Array of shape (m_samples, embedding_dim).
        sigma (float): Bandwidth of the Gaussian kernel.

    Returns:
        np.ndarray: Kernel matrix of shape (n_samples, m_samples).
    """
    from scipy.spatial import distance
    pairwise_dists = distance.cdist(x, y, 'sqeuclidean')  # Squared Euclidean distances
    return np.exp(-pairwise_dists / (2 * sigma ** 2))

def compute_mmd(X, Y, sigma=1.0):
    """
    Compute the Maximum Mean Discrepancy (MMD) between two distributions.

    Args:
        X (np.ndarray): Samples from the first distribution (n_samples, embedding_dim).
        Y (np.ndarray): Samples from the second distribution (m_samples, embedding_dim).
        sigma (float): Bandwidth of the Gaussian kernel.

    Returns:
        float: MMD^2 value.
    """
    n, m = len(X), len(Y)

    # Compute kernel matrices
    K_xx = gaussian_kernel(X, X, sigma)
    K_yy = gaussian_kernel(Y, Y, sigma)
    K_xy = gaussian_kernel(X, Y, sigma)

    # Compute MMD^2
    mmd = (
        np.sum(K_xx) / (n * (n - 1))  # Exclude diagonal for unbiased estimate
        + np.sum(K_yy) / (m * (m - 1))
        - 2 * np.sum(K_xy) / (n * m)
    )

    return mmd

In [9]:
reference_embedding = dfs['train_small']['embedding']

# comparison datasets
iemocap_embeddings = dfs['iemocap']['embedding']
synthetic_embeddings = dfs['synthetic_data']['embedding']
test_embeddings = dfs['test']['embedding']
eval_embeddings = dfs['eval']['embedding']
train_embeddings = dfs['train']['embedding']

def to_matrix(embedding_col):
  # n x 1 x 768
  # print(embedding_col[0].shape)
  return np.array([embedding_col[0][0] for row in embedding_col])

reference_matrix = to_matrix(reference_embedding)
iemocap_matrix = to_matrix(iemocap_embeddings)
synthetic_matrix = to_matrix(synthetic_embeddings)
test_matrix = to_matrix(test_embeddings)
eval_matrix = to_matrix(eval_embeddings)
train_matrix = to_matrix(train_embeddings)

In [10]:
# Wasserstein distance between the embeddings of each dataset

# TODO: look into using wasserstein distance as loss metric.


import ot  # Install via pip: pip install POT

def wasserstein_distance(ref_matrix, cmp_matrix):
  # Assume friends_embeddings and online_embeddings are numpy arrays of shape (n_samples, n_features)
  # Compute empirical distributions
  P = np.ones(len(ref_matrix)) / len(ref_matrix)
  Q = np.ones(len(cmp_matrix)) / len(cmp_matrix)
  # Compute cost matrix (e.g., Euclidean distance)
  cost_matrix = ot.dist(ref_matrix, cmp_matrix, metric='euclidean')

  # Compute Wasserstein Distance (EMD)
  wasserstein_distance = ot.emd2(P, Q, cost_matrix)

  return wasserstein_distance


In [11]:
print(f"Wasserstein distance between reference and iemocap embeddings: {wasserstein_distance(reference_matrix, iemocap_matrix)}")
print(f"Wasserstein distance between reference and synthetic embeddings: {wasserstein_distance(reference_matrix, synthetic_matrix)}")
print(f"Wasserstein distance between reference and test embeddings: {wasserstein_distance(reference_matrix, test_matrix)}")
print(f"Wasserstein distance between reference and eval embeddings: {wasserstein_distance(reference_matrix, eval_matrix)}")
print(f"Wasserstein distance between reference and train embeddings: {wasserstein_distance(reference_matrix, train_matrix)}")

Wasserstein distance between reference and iemocap embeddings: 39.29992985062456
Wasserstein distance between reference and synthetic embeddings: 36.64593145113672
Wasserstein distance between reference and test embeddings: 28.977975201102275
Wasserstein distance between reference and eval embeddings: 28.14082181803641
Wasserstein distance between reference and train embeddings: 22.209906232458987


In [12]:
print(f"MMD between reference and iemocap embeddings: {compute_mmd(reference_matrix, iemocap_matrix)}")
print(f"MMD between reference and synthetic embeddings: {compute_mmd(reference_matrix, synthetic_matrix)}")
print(f"MMD between reference and test embeddings: {compute_mmd(reference_matrix, test_matrix)}")
print(f"MMD between reference and eval embeddings: {compute_mmd(reference_matrix, eval_matrix)}")
print(f"MMD between reference and train embeddings: {compute_mmd(reference_matrix, train_matrix)}")

MMD between reference and iemocap embeddings: 2.0006427505728137
MMD between reference and synthetic embeddings: 2.0005101751683334
MMD between reference and test embeddings: 2.000826615270757
MMD between reference and eval embeddings: 2.0013821078670704
MMD between reference and train embeddings: 2.0005310125743776


In [None]:
# Pasted from GPT. Integrate/interpret (TODO). t-SNE maps back down to 2d-3d for visualization purposes.
# Visualization with t-SNE
# combined_embeddings = np.vstack((friends_embeddings, online_embeddings))
# combined_labels = ['Friends'] * n_friends + ['Online'] * n_online

# tsne = TSNE(n_components=2, random_state=42)
# embeddings_2d = tsne.fit_transform(combined_embeddings)

# plt.figure(figsize=(10, 7))
# for label in set(combined_labels):
#     idx = [i for i, l in enumerate(combined_labels) if l == label]
#     plt.scatter(embeddings_2d[idx, 0], embeddings_2d[idx, 1], label=label, alpha=0.5)
# plt.legend()
# plt.title("t-SNE Visualization of Embeddings")
# plt.show()

## Estimating Probability Distributions

 Jensen-Shannon (JS) Divergence and Bhattacharyya Distance
JS Divergence:
Symmetric: Unlike KL divergence, it's symmetric.
Bounded: Values range between 0 and 1, making interpretation easier.
Bhattacharyya Distance:
Measures Overlap: Quantifies the overlap between two statistical samples or populations.
Why Not Ideal Here:
Requires Probability Distributions: Need to estimate PDFs from embeddings, which can be challenging with high-dimensional data.
Methods:
Adversarial Training: Train a discriminator to distinguish between source and target embeddings while training the feature extractor to confuse the discriminator.
MMD-Based Adaptation: Incorporate MMD as a regularizer in your loss function to minimize distributional differences.


In [None]:
# Jensen-Shannon divergence between the embeddings of each dataset

# NOT RECOMMENDED

In [None]:
# Bhattacharyya distance between the embeddings of each dataset
# NOT RECOMMENDED
# def Bhattacharyya_distance():

