In [33]:
import joblib
import pandas as pd
import csv
import numpy as np
from itertools import zip_longest
import torch
from pathlib import Path
from tqdm import tqdm
from typing import Any, Dict, List, Optional, Tuple, Union
pd.set_option('display.max_rows', 200)

In [28]:
experiment_path = "../speech_hypertuning/experiments/create_splits/random/"
state = joblib.load(experiment_path + '/state.pkl')
df = state['dataset_metadata']

In [29]:
# Sort by audio count intercalated by gender
sid_to_audios_count = df.groupby(['speaker_id']).size().to_dict()
sid_to_gender = (
    df[['speaker_id', 'Gender']]
    .drop_duplicates()
    .set_index("speaker_id")
    .to_dict()["Gender"]
)
sorted_sid_by_audios_count = [
    sid
    for sid, _ in sorted(
        sid_to_audios_count.items(), key=lambda tuple: tuple[1], reverse=True
    )
]

sorted_male_sid_by_audios_count = [
    sid for sid in sorted_sid_by_audios_count if sid_to_gender[sid] == "m"
]
sorted_female_sid_by_audios_count = [
    sid for sid in sorted_sid_by_audios_count if sid_to_gender[sid] == "f"
]

alternated_list = []
for male_sid, female_sid in zip_longest(
    sorted_male_sid_by_audios_count, sorted_female_sid_by_audios_count
):
    if male_sid is not None:
        alternated_list.append(male_sid)
    if female_sid is not None:
        alternated_list.append(female_sid)

## Naive

In [36]:
def process_sample_sizes(
    proportions: Dict[str, Union[int, float]],
    speaker_df: pd.DataFrame,
) -> Dict[str, int]:
    sample_sizes = {}
    for partition, v in proportions.items():
        if isinstance(v, float):
            sample_size = int(len(speaker_df) * v)
        elif isinstance(v, int):
            sample_size = int(v)
        else:
            raise ValueError(f"Unsoported value in proportions for partition {partition}: {v}")
        sample_sizes[partition] = sample_size
    return sample_sizes

def sort_idx_alternating_video_ids(
    speaker_df: pd.DataFrame,
    sampled_idxs: np.ndarray,
    sample_size: int,
) -> List[int]:
    """
    Sort indexes alternating one from each video id
    The reason of this is to take rows in order when subsampling,
    as having different videos is more valuable than multiple
    segments from the same video_id
    """
    sampled_idxs_order = []
    unorder_speaker_partition_df = speaker_df.loc[sampled_idxs].copy()
    while len(sampled_idxs_order) < sample_size:
        for video_id in unorder_speaker_partition_df.video_id.unique():
            df_to_choose = unorder_speaker_partition_df[(unorder_speaker_partition_df.video_id == video_id) & (~unorder_speaker_partition_df.index.isin(sampled_idxs_order))]
            if df_to_choose.empty:
                continue
            idx = np.random.choice(
                a=df_to_choose.index, size=1, replace=False
            )[0]
            sampled_idxs_order.append(idx)
    return sampled_idxs_order

In [53]:
df = state['dataset_metadata'].copy()
dfs = []
for speaker_id in tqdm(alternated_list):
    # Process sample sizes for speaker id once, because the df will get modified
    speaker_df = df[df.speaker_id == speaker_id].copy()
    sample_sizes = process_sample_sizes(proportions=proportions, speaker_df=speaker_df)

    for partition, sample_size in sample_sizes.items():
        if partition != remainder_k:
            speaker_df = df[df.speaker_id == speaker_id].copy()
            speaker_df.drop("Set", axis=1, inplace=True)

            sampled_idxs = np.random.choice(
                a=speaker_df.index, size=sample_size, replace=False
            )

            sampled_idxs_order = sort_idx_alternating_video_ids(speaker_df=speaker_df, sampled_idxs=sampled_idxs, sample_size=sample_size)

            assert len(sampled_idxs) == len(sampled_idxs_order) == sample_size

            speaker_partition_df = speaker_df.loc[sampled_idxs_order]

            df = df[~df.index.isin(speaker_partition_df.index)]  # Remove chosen
            speaker_partition_df['set'] = partition
            dfs.append(speaker_partition_df)


    if remainder_k is not None:
        speaker_df = df[df.speaker_id == speaker_id].copy() # Remaining speaker df
        sampled_idxs_order = sort_idx_alternating_video_ids(speaker_df=speaker_df, sampled_idxs=speaker_df.index, sample_size=len(speaker_df))
        speaker_partition_df = speaker_df.loc[sampled_idxs_order]
        speaker_partition_df.drop("Set", axis=1, inplace=True)
        speaker_partition_df['set'] = remainder_k
        df = df[~df.index.isin(speaker_partition_df.index)]  # Remove chosen
        dfs.append(speaker_partition_df)

splits_df = pd.concat(dfs)

 ... (more hidden) ...


In [54]:
assert len(splits_df) == len(state['dataset_metadata'])

In [55]:
df = pd.read_csv("/home/eernst/Voxceleb1/splits.csv")
df

Unnamed: 0.1,Unnamed: 0,index,filename,sr,channels,frames,duration,speaker_id,video_id,segment_id,VGGFace1 ID,Gender,Nationality,dataset,rel_path,set
0,141482,141482,/datasets/Voxceleb1/wav/id10986/Gq881M1WnKM/00...,16000,1,111361,6.960063,id10986,Gq881M1WnKM,3,Rob_Reiner,m,USA,voxceleb1,wav/id10986/Gq881M1WnKM/00003.wav,validation
1,141696,141696,/datasets/Voxceleb1/wav/id10986/6SBGrr4iFp0/00...,16000,1,65921,4.120063,id10986,6SBGrr4iFp0,30,Rob_Reiner,m,USA,voxceleb1,wav/id10986/6SBGrr4iFp0/00030.wav,validation
2,141820,141820,/datasets/Voxceleb1/wav/id10986/lFEBeMFuL_M/00...,16000,1,88961,5.560062,id10986,lFEBeMFuL_M,17,Rob_Reiner,m,USA,voxceleb1,wav/id10986/lFEBeMFuL_M/00017.wav,validation
3,141645,141645,/datasets/Voxceleb1/wav/id10986/KH-yJAsKo1Q/00...,16000,1,553601,34.600062,id10986,KH-yJAsKo1Q,12,Rob_Reiner,m,USA,voxceleb1,wav/id10986/KH-yJAsKo1Q/00012.wav,validation
4,141389,141389,/datasets/Voxceleb1/wav/id10986/sOlWHKGy-SY/00...,16000,1,144641,9.040062,id10986,sOlWHKGy-SY,66,Rob_Reiner,m,USA,voxceleb1,wav/id10986/sOlWHKGy-SY/00066.wav,validation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153511,138942,138942,/datasets/Voxceleb1/wav/id11229/zO6sDDC8jDw/00...,16000,1,120961,7.560062,id11229,zO6sDDC8jDw,7,Will_Mellor,m,UK,voxceleb1,wav/id11229/zO6sDDC8jDw/00007.wav,train
153512,138911,138911,/datasets/Voxceleb1/wav/id11229/hMMde0Vtg-M/00...,16000,1,136321,8.520062,id11229,hMMde0Vtg-M,2,Will_Mellor,m,UK,voxceleb1,wav/id11229/hMMde0Vtg-M/00002.wav,train
153513,138922,138922,/datasets/Voxceleb1/wav/id11229/AiJinimEdKA/00...,16000,1,299521,18.720063,id11229,AiJinimEdKA,1,Will_Mellor,m,UK,voxceleb1,wav/id11229/AiJinimEdKA/00001.wav,train
153514,138943,138943,/datasets/Voxceleb1/wav/id11229/zO6sDDC8jDw/00...,16000,1,126081,7.880063,id11229,zO6sDDC8jDw,4,Will_Mellor,m,UK,voxceleb1,wav/id11229/zO6sDDC8jDw/00004.wav,train


In [56]:
len(state['dataset_metadata'])

153516

In [25]:
df4.groupby(['speaker_id']).size().describe()

count     100.000000
mean      308.860000
std       105.420564
min       216.000000
25%       254.000000
50%       283.000000
75%       330.500000
max      1002.000000
dtype: float64

In [9]:
speaker_df = df.loc[sampled_idxs][df.speaker_id == "id10986"].copy()
d = speaker_df.groupby(['video_id']).size().to_dict()

  speaker_df = df.loc[sampled_idxs][df.speaker_id == "id10986"].copy()


In [10]:
len(d)

23

In [11]:
d

{'33NaEPsiFEA': 2,
 '4p9Fm05hlIM': 3,
 '6SBGrr4iFp0': 11,
 '79oWAvMcAm0': 1,
 'GPPBWbOC1FM': 3,
 'Gq881M1WnKM': 3,
 'Hyck_yynhF4': 9,
 'KH-yJAsKo1Q': 8,
 'KZVJwFblEUo': 1,
 'Pqwj7Sk4Qyw': 1,
 'SfYt6Y6vrOI': 1,
 'Vu__sFrtFfY': 1,
 'Vzqmi3MAixQ': 15,
 'XaXt-hdv2Js': 1,
 'd8H0ZdING7Y': 1,
 'eWvntayRfe8': 15,
 'fFnHb9jzYwU': 1,
 'lFEBeMFuL_M': 15,
 'sOlWHKGy-SY': 15,
 'tj0Mw2xTE74': 3,
 'w7DmOHDIMQ8': 16,
 'wJ_YINYHBwY': 8,
 'xFTIClN5z70': 16}

## Heuristic to separate video_ids for validation and test

In [36]:
df = state['dataset_metadata'].copy()
proportions = {"train": 0.75, "validation": 0.15, "test": 0.15}
RETRIES_DEFAULT = 30
dfs = []
extra_audios_list = [i for i in range(10)] + [i for i in range(-1, -6, -1)]
extra_audios_i = 0
np.random.seed(42)
for speaker_id in alternated_list:
    sample_sizes = {}
    speaker_df = df[df.speaker_id == speaker_id].copy()
    for partition, v in proportions.items():
        sample_sizes[partition] = int(len(speaker_df) * v)
    
    for partition, v in proportions.items():
        if partition != "train":
            print((speaker_id, partition))
            speaker_df = df[df.speaker_id == speaker_id].copy()
            speaker_df.drop("Set", axis=1, inplace=True)

            sample_size = sample_sizes[partition]
            # Get group of video ids that sum sample_size + extra_audios (desirably extra_audios=0)
            # This is a naive approach to the knapsack problem
            video_id_to_audios_count = speaker_df.groupby(['video_id']).size().to_dict()
            print(video_id_to_audios_count)
            extra_audios_i = 0
            retries = RETRIES_DEFAULT
            found_group_of_videos = False
            while not found_group_of_videos:
                chosen_videos = []
                videos_sum = 0
                print(f"RETRY: {retries}")
                extra_audios = extra_audios_list[extra_audios_i]
                while videos_sum < sample_size + extra_audios:
                    remaining_audios = sample_size + extra_audios - videos_sum
                    print(f"remaining_audios: {remaining_audios} with extra videos {extra_audios} video sum {videos_sum}")
                    video_id_candidates = [
                        video_id
                        for video_id, audio_count in video_id_to_audios_count.items()
                        if video_id not in chosen_videos and audio_count <= remaining_audios
                    ]
                    if not video_id_candidates:
                        break

                    probabilities = [
                        audio_count # As all are less than remaining_audios, the bigger the closer to remaining_audios
                        for video_id, audio_count in video_id_to_audios_count.items()
                        if video_id in video_id_candidates
                    ]
                    normalized_probabilities = [prob / sum(probabilities) for prob in probabilities]
                    video_id = np.random.choice(a=video_id_candidates, size=1, replace=False, p=normalized_probabilities)[0]
                    chosen_videos.append(video_id)
                    videos_sum += video_id_to_audios_count[video_id]
                    print(f"chose: {video_id} with count {video_id_to_audios_count[video_id]} sum {videos_sum}")

                if videos_sum == sample_size + extra_audios:
                    found_group_of_videos = True
                elif retries == 0:
                    retries = RETRIES_DEFAULT
                    extra_audios_i += 1
                else:
                    retries -= 1

                if extra_audios_i >= len(extra_audios_list) or extra_audios / sample_size > 0.8:
                    raise ValueError(speaker_id)
            
            speaker_partition_df = speaker_df[speaker_df.video_id.isin(chosen_videos)].copy()
            df = df[~df.index.isin(speaker_partition_df.index)]  # Remove chosen
            speaker_partition_df['set'] = partition
            dfs.append(speaker_partition_df)

    # Train partition
    print((speaker_id, "train"))
    speaker_partition_df = df[df.speaker_id == speaker_id].copy()
    speaker_partition_df.drop("Set", axis=1, inplace=True)
    speaker_partition_df['set'] = "train"
    df = df[~df.index.isin(speaker_partition_df.index)]  # Remove chosen
    dfs.append(speaker_partition_df)

('id10986', 'validation')
{'33NaEPsiFEA': 4, '4p9Fm05hlIM': 11, '6SBGrr4iFp0': 50, '6c5ljEc2Clo': 3, '79oWAvMcAm0': 5, '94JZBdGzoxc': 1, 'GPPBWbOC1FM': 19, 'Gq881M1WnKM': 18, 'Hyck_yynhF4': 62, 'KH-yJAsKo1Q': 46, 'KZVJwFblEUo': 1, 'Pqwj7Sk4Qyw': 1, 'SfYt6Y6vrOI': 5, 'Vu__sFrtFfY': 5, 'Vzqmi3MAixQ': 127, 'XBqRyPiDJ9k': 1, 'XaXt-hdv2Js': 10, 'b7NFJBhE9EY': 5, 'd8H0ZdING7Y': 3, 'eWvntayRfe8': 89, 'fFnHb9jzYwU': 9, 'g7SptlsGbaA': 8, 'lFEBeMFuL_M': 131, 'p5qFkL8yVuI': 11, 's2077pq_vdY': 6, 'sOlWHKGy-SY': 122, 'tj0Mw2xTE74': 12, 'w7DmOHDIMQ8': 99, 'wJ_YINYHBwY': 25, 'wyklNur-auI': 1, 'xFTIClN5z70': 111, 'xmf3xBGKSoA': 1}
RETRY: 30
remaining_audios: 150 with extra videos 0 video sum 0
chose: d8H0ZdING7Y with count 3 sum 3
remaining_audios: 147 with extra videos 0 video sum 3
chose: xFTIClN5z70 with count 111 sum 114
remaining_audios: 36 with extra videos 0 video sum 114
chose: s2077pq_vdY with count 6 sum 120
remaining_audios: 30 with extra videos 0 video sum 120
chose: fFnHb9jzYwU with count

ValueError: id10105

In [40]:
df = state['dataset_metadata'].copy()
speaker_df = df[df.speaker_id == "id10105"].copy()
print(f"sample size {int(len(speaker_df) * v)}")
speaker_df.groupby(['video_id']).size().to_dict()

sample size 47


{'ArZCDzx68s0': 12,
 'F_LzPRjwV_0': 5,
 'J1UdMqcSkgU': 9,
 'LGTcAlSu2OA': 2,
 'WXu5vaO0lQM': 3,
 'Z4zvj6SOBZs': 8,
 'jt5KpM6eAeA': 3,
 'vJSc3FB4US0': 87,
 'wrHyTrC37FQ': 184,
 'yApWxyEm_Do': 3}

# Measure intra/inter video_id similarity

In [3]:
embeddings_dir = "/home/eernst/Voxceleb1/avg_embeddings/"

In [4]:
embeddings_paths = Path(embeddings_dir).rglob('*.pt')
embeddings = {}
for path in embeddings_paths:
    split_path = path.name.split("_")
    if "voxceleb1" in split_path:
        split_path.remove("voxceleb1")
    if "" in split_path:
        split_path.remove("")
    try:
        speaker_id, video_id, segment_id = split_path
    except:
        if len(split_path) == 4:
            speaker_id, video_id, segment_id = split_path[0], split_path[1] + split_path[2], split_path[3]
        
        elif len(split_path) == 5:
            speaker_id, video_id, segment_id = split_path[0], split_path[1] + split_path[2] + split_path[3], split_path[4]
        else:
            print(split_path)
            raise ValueError(path)
    segment_id = segment_id.replace(".pt", "")
    if speaker_id not in embeddings:
        embeddings[speaker_id] = {}
    if video_id not in embeddings[speaker_id]:
        embeddings[speaker_id][video_id] = {}
    with torch.no_grad():
        embeddings[speaker_id][video_id][segment_id] = torch.load(
            path,
            weights_only=True,
        )

In [5]:
len(embeddings)

123

In [6]:
def cosine_distances_matrix(a, b, eps=1e-8):
    a_n, b_n = a.norm(dim=1)[:, None], b.norm(dim=1)[:, None]
    a_normalized = a / torch.clamp(a_n, min=eps)
    b_normalized = b / torch.clamp(b_n, min=eps)
    distances_matrix = 1 - torch.mm(a_normalized, b_normalized.transpose(0, 1))
    return distances_matrix

def mean_cosine_distance(vectors):
    distances_matrix = cosine_distances_matrix(vectors, vectors)
    different_distances = distances_matrix[torch.triu(distances_matrix, diagonal=1).nonzero(as_tuple=True)]
    return torch.mean(different_distances).item()

In [7]:
video_ids_to_remove = []
for speaker_id, video_id_dict in embeddings.items():
    for video_id, segment_id_dict in video_id_dict.items():
        video_id_embeddings = [embedding for embedding in segment_id_dict.values()]
        if len(video_id_embeddings) == 1:
            video_ids_to_remove.append((speaker_id, video_id))
for speaker_id, video_id in video_ids_to_remove:
    del embeddings[speaker_id][video_id]

In [8]:
len(video_ids_to_remove)

2260

In [9]:
speaker_to_remove = []
for speaker_id, video_id_dict in embeddings.items():
    if not video_id_dict or len(video_id_dict) == 1:
        speaker_to_remove.append(speaker_id)
for speaker_id in speaker_to_remove:
    del embeddings[speaker_id]

In [10]:
len(speaker_to_remove)

46

In [11]:
len(embeddings)

77

In [12]:
video_ids = 0
for video_ids_dict in embeddings.values():
    video_ids += len(video_ids_dict)
print(f"There are {video_ids} videos")

There are 1046 videos


In [18]:
intra_video_id_distances = {}
for speaker_id, video_id_dict in embeddings.items():
    intra_video_id_distances[speaker_id] = {}
    for video_id, segment_id_dict in video_id_dict.items():
        video_id_embeddings = [embedding for embedding in segment_id_dict.values()]

        layers_distances = []
        for layer in range(13):
            layer_embeddings = [embedding[layer].unsqueeze(dim=0) for embedding in video_id_embeddings]
            layer_embeddings = torch.cat(layer_embeddings, dim=0)
            layer_distance = mean_cosine_distance(layer_embeddings)
            layers_distances.append(layer_distance)

            intra_video_id_distance = np.mean(np.array(layers_distances))
            intra_video_id_distances[speaker_id][video_id] = intra_video_id_distance

In [19]:
len(intra_video_id_distances)

77

In [22]:
inter_video_id_distances = {}
for speaker_id, video_id_dict in embeddings.items():
    video_id_embeddings = []
    for video_id, segment_id_dict in video_id_dict.items():
        video_id_all_embeddings = [embedding.unsqueeze(dim=0) for embedding in segment_id_dict.values()]
        video_id_all_embeddings = torch.cat(video_id_all_embeddings, dim=0)
        video_id_mean_embedding = torch.mean(video_id_all_embeddings, dim=0)
        video_id_embeddings.append(video_id_mean_embedding)

    layers_distances = []
    for layer in range(13):
        layer_embeddings = [embedding[layer].unsqueeze(dim=0) for embedding in video_id_embeddings]
        layer_embeddings = torch.cat(layer_embeddings, dim=0)
        layer_distance = mean_cosine_distance(layer_embeddings)
        layers_distances.append(layer_distance)

    inter_video_id_distance = np.mean(np.array(layers_distances))
    inter_video_id_distances[speaker_id] = inter_video_id_distance

In [42]:
rows = []
for speaker_id in embeddings:
    inter_video_id_distance = inter_video_id_distances[speaker_id]
    for speaker_id, video_id_dict in embeddings.items():
        for video_id in video_id_dict:
            intra_video_id_distance = intra_video_id_distances[speaker_id][video_id]
            intra_v_inter_distance = abs(inter_video_id_distance - intra_video_id_distance)
            rows.append([speaker_id, video_id, inter_video_id_distance, intra_video_id_distance, intra_v_inter_distance])

In [43]:
df_distances = pd.DataFrame(rows, columns=["speaker_id", "video_id", "inter_video_id_distance", "intra_video_id_distance", "intra_v_inter_distance_diff"])
df_distances

Unnamed: 0,speaker_id,video_id,inter_video_id_distance,intra_video_id_distance,intra_v_inter_distance_diff
0,id10856,URycIznEI,0.143094,0.175446,0.032351
1,id10856,lc1yDLHEKT8,0.143094,0.127802,0.015292
2,id10856,qlkFLSNBKvI,0.143094,0.221063,0.077969
3,id10856,cNvo7HjL8ys,0.143094,0.175990,0.032896
4,id10856,1luQazgw6jo,0.143094,0.227188,0.084093
...,...,...,...,...,...
80537,id11228,bM6TWdTWGKA,0.164588,0.112949,0.051640
80538,id11228,DGNRvw7sNLM,0.164588,0.122627,0.041962
80539,id11228,D851gSJLDak,0.164588,0.135522,0.029066
80540,id11228,avOhqSI8AbU,0.164588,0.142493,0.022095


In [28]:
df_distances.intra_v_inter_distance_diff.describe()

count    8.054200e+04
mean     3.505817e-02
std      2.853617e-02
min      1.097528e-07
25%      1.377374e-02
50%      2.903220e-02
75%      4.945592e-02
max      2.824960e-01
Name: intra_v_inter_distance_diff, dtype: float64

In [30]:
df_distances.intra_video_id_distance.describe()

count    80542.000000
mean         0.152814
std          0.042056
min          0.017197
25%          0.123909
50%          0.147991
75%          0.175996
max          0.402956
Name: intra_video_id_distance, dtype: float64

In [31]:
df_distances.inter_video_id_distance.describe()

count    80542.000000
mean         0.152904
std          0.016574
min          0.120460
25%          0.139252
50%          0.152792
75%          0.165144
max          0.186033
Name: inter_video_id_distance, dtype: float64

In [24]:
len(intra_video_id_distances.values()) == video_ids # One by video_id

False

In [61]:
len(inter_video_id_distances) == len(embeddings) # One by speaker

True

In [62]:
len(inter_video_id_distances)

77

In [63]:
mean_intra_video_id_distance

0.15281383805212545

In [64]:
mean_inter_video_id_distance

0.1529041120408477

In [65]:
print(f"inter-video distance is bigger than intra-video distance for a factor of {(mean_inter_video_id_distance / mean_intra_video_id_distance) - 1}")

inter-video distance is bigger than intra-video distance for a factor of 0.0005907448557862516
