In [2]:
import random
import os
import sys
import pandas as pd
import torch
import torchaudio
import torch.nn.functional as F
import torchaudio.transforms as T
import librosa
import matplotlib.pyplot as plt
from IPython.display import Audio, display
import json
import numpy as np
import math
import seaborn as sns
sns.set_theme()
sns.set_style("ticks")
sns.set_context("paper")

from typing import Optional

from datetime import timedelta

from src.utils import (
    create_dataset, plot_spectrogram,
    RandomClip, extract_logmel, pad_tensor, plot_waveform
)
from src.datasets import VoxCelebDataModule
from src.models import SEBlock, SpeakerRecognitionModel, build_efficientnetv2
from torch import nn
from sklearn.decomposition import PCA

from src.losses import SubCenterAAMSoftmaxLoss
from sklearn.cluster import KMeans
from sklearn.metrics import roc_curve, accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

from src.utils import (
    RandomBackgroundNoise, RandomClip, RandomSpeedChange,
    create_features_from_row, kmeans_plot
)
from tqdm.auto import tqdm
from pedalboard import Pedalboard, Reverb, Chorus

In [None]:
# Load one sample VoxCeleb1 utterance.
# A raw string keeps the Windows backslashes literal; the previous literal
# depended on invalid escape sequences (`\D`, `\V`, `\i`) being passed through.
SAMPLE_WAV_SPEECH_PATH = r"E:\Datasets\VoxCeleb1\vox1_dev\id10015\7rzuEmfRFEA\00001.wav"
waveform, sample_rate = torchaudio.load(SAMPLE_WAV_SPEECH_PATH)

In [None]:
# Load a precomputed feature tensor from disk.
# Raw string: same path value as before, without invalid escape sequences.
melspec = torch.load(r"E:\Datasets\VoxCeleb1\subset\verification_None\test\id10153\x29OJk3Ec-Q\00001_.pt")

In [None]:
# Recompute `melspec` from the loaded waveform with the project helper
# `extract_logmel` (n_mels=80); this overwrites the tensor loaded in the
# previous cell.
melspec = extract_logmel(waveform, n_mels=80)

In [None]:
# MFCC transform producing 40 coefficients.
# Use the sample rate returned by `torchaudio.load` instead of a hard-coded
# 16000 so the transform always matches the loaded audio.
mfcc_t = torchaudio.transforms.MFCC(
    sample_rate=sample_rate,
    n_mfcc=40
)

In [None]:
# Apply the MFCC transform to the sample waveform.
mfcc = mfcc_t(waveform)

In [None]:
# Plot the first entry along dim 0 of `melspec` with the project plotting helper.
plot_spectrogram(melspec[0])

In [None]:
# Spectrogram of the sample waveform using torchaudio's default
# `Spectrogram` settings (`T` is the `torchaudio.transforms` alias).
spr = T.Spectrogram()
sprr = spr(waveform)

In [None]:
# Mel spectrogram of the first channel computed with librosa, for comparison
# with the torchaudio features. `power=1` is passed through to librosa.
# The sample rate comes from the loaded file rather than a hard-coded 16000.
sprr2 = librosa.feature.melspectrogram(
    y=waveform.numpy()[0],
    sr=sample_rate,
    power=1
)

In [None]:
# Plot the log of the librosa mel spectrogram.
# The small epsilon (same value used later for `logmel`) avoids log(0) = -inf
# for empty bins, which would otherwise distort the plot.
plot_spectrogram(np.log(sprr2 + 1e-6))

In [None]:
# Load a different sample utterance from a POSIX-style path; this rebinds
# `SAMPLE_WAV_SPEECH_PATH`, `waveform` and `sample_rate` from the earlier cell.
SAMPLE_WAV_SPEECH_PATH = "/media/gabriele/Seagate Expansion Drive/Datasets/VoxCeleb1/vox1_dev/id10001/1zcIwhmdeo4/00001.wav"
waveform, sample_rate = torchaudio.load(SAMPLE_WAV_SPEECH_PATH)

In [None]:
# Display the loaded waveform tensor together with its sample rate.
waveform, sample_rate

In [None]:
# Display the shape of the loaded waveform tensor.
waveform.shape

In [None]:
# Effect chain with a single reverb (room_size=0.75).
reverb = Pedalboard([Reverb(room_size=0.75)])

In [None]:
# Effect chain applying a chorus followed by a reverb (room_size=0.25).
board = Pedalboard([Chorus(), Reverb(room_size=0.25)])

In [None]:
# Run the chorus + reverb chain on the waveform, using the sample rate that
# was returned when the file was loaded instead of a hard-coded 16000.
board(waveform, sample_rate=sample_rate)

In [None]:
# Run the reverb-only chain on the waveform, using the sample rate that was
# returned when the file was loaded instead of a hard-coded 16000.
reverb(waveform, sample_rate=sample_rate)

In [None]:
# Instantiate the project's `RandomClip` transform with its default arguments.
rc = RandomClip()

In [None]:
# Apply `RandomClip` to the waveform and display the shape of the result.
clipped = rc(waveform)
clipped.shape

In [None]:
# Mel-spectrogram transform: 512-point FFT, 400-sample window, 160-sample hop,
# 80 mel bins, built for the sample rate of the loaded waveform.
n_fft = 512
mel_spectrogram = T.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        win_length=400,
        hop_length=160,
        center=True,
        pad_mode="reflect",
        power=2.0, # 2.0 yields a power spectrogram (1.0 would give magnitude/energy)
        norm="slaney",
        onesided=True,
        n_mels=80,
        mel_scale="htk",
    )

In [None]:
# Sliding-window cepstral mean normalisation and dB conversion transforms.
# NOTE(review): `cmn_window` is set to `n_fft` (512) — confirm that reusing the
# FFT size as the CMN window length is intended.
# NOTE(review): `stype="amplitude"` is used although `mel_spectrogram` above is
# configured with power=2.0 — confirm the intended scale.
cmn = T.SlidingWindowCmn(cmn_window=n_fft)
to_db = T.AmplitudeToDB(stype="amplitude")

In [None]:
# Apply the MelSpectrogram transform to the clipped waveform and show its shape.
# NOTE(review): despite the name `logmel`, no log/dB scaling is applied in this cell.
logmel = mel_spectrogram(clipped)
logmel.shape

In [None]:
# Display the values of the mel-spectrogram tensor.
logmel

In [None]:
# Recompute the features with the project helper `extract_logmel` (80 mel
# bins), passing the sample rate of the loaded file instead of a hard-coded
# 16000, then show the resulting shape.
logmel = extract_logmel(clipped, sample_rate=sample_rate, n_mels=80)
logmel.shape

In [None]:
# Load a precomputed training feature tensor from disk.
# Raw string: same path value as before, without invalid escape sequences.
logmel = torch.load(r"E:\Datasets\VoxCeleb1\subset\features_3\train\id10075\08H4--mL1LQ\00001_.pt")

In [None]:
# Log-compress the features (epsilon guards against log(0)), then apply
# instance normalisation. The log result is assigned back to `logmel` so the
# normalisation operates on the log-compressed tensor rather than on the raw one.
logmel = torch.log(logmel + 1e-6)
logmel = F.instance_norm(logmel)

In [None]:
# Plot the first entry along dim 0 of `melspec`.
# NOTE(review): this plots `melspec` from an earlier cell, not the `logmel`
# tensor processed just above — confirm which one was intended.
plot_spectrogram(melspec[0])

In [None]:
# Masking transforms: frequency masking with freq_mask_param=10 and time
# masking with time_mask_param=5.
freq_masking = T.FrequencyMasking(freq_mask_param=10)
time_masking = T.TimeMasking(time_mask_param=5)

In [None]:
# Apply frequency masking followed by time masking to `logmel`.
spec_logmel = time_masking(freq_masking(logmel))

In [None]:
# Plot the masked features computed in the previous cell. Re-applying the
# random masking transforms here would draw different masks, so the figure
# would not show the tensor stored in `spec_logmel`.
plot_spectrogram(spec_logmel[0])

In [3]:
# Read the labels CSV for the chosen clip length and keep its "label" column
# as an {index: label} dictionary.
num_secs = 3
csv_base_path: str = "E:/Datasets/VoxCeleb1/subset/"
labels_csv_path = f"{csv_base_path}subset_labels_{num_secs}.csv"
label_dict = pd.read_csv(labels_csv_path)["label"].to_dict()

In [4]:
# The keys of `label_dict` are used as the speaker IDs; show how many there are.
speaker_ids = list(label_dict.keys())
len(speaker_ids)

100

In [5]:
# Build the feature dataset for the 100 speakers listed in `speaker_ids`
# using the project helper `create_dataset`.
create_dataset(
    # speaker selection
    num_speakers=100,
    speaker_ids=speaker_ids,
    # clipping / augmentation / test-set options
    clip_secs=4,
    data_aug=True,
    full_test=True,
    wave_test=True,
    # spectrogram parameters
    n_fft=400,
    win_length=400,
    hop_length=160,
    n_mels=80,
    power=1.0,
    # post-processing flags
    to_db_flag=True,
    cmn_flag=True,
    # Alternative locations (Linux mount), kept for reference:
    # base_path="/media/gabriele/Seagate Expansion Drive/Datasets/VoxCeleb1/",
    # noise_dir="/media/gabriele/Seagate Expansion Drive/Datasets/Musan/noise",
    # babble_dir="/media/gabriele/Seagate Expansion Drive/Datasets/Musan/speech"
)

train    11828
test       665
val        549
Name: Set, dtype: int64
Num speakers: 100
Male ratio in dataset: 0.5515587529976019
Female ratio in dataset: 0.44844124700239807
Male sampled ratio: 0.55
Female sampled ratio: 0.45
Num sampled males: 55
Num sampled females: 45



                                                                        

In [None]:
# Read the labels CSV and use its index as the list of speaker IDs.
# Raw string: same path value as before, without invalid escape sequences.
df = pd.read_csv(r"E:\Datasets\VoxCeleb1\subset\subset_labels_3.csv")
speaker_ids = df.index.tolist()

In [None]:
# Build a small 5-speaker dataset with 6-second clips via `create_dataset`.
create_dataset(
    speaker_ids=speaker_ids,
    num_speakers=5,
    clip_secs=6,
    to_db_flag=True,
    cmn_flag=True,
    # Alternative locations (Linux mount), kept for reference:
    # base_path="/media/gabriele/Seagate Expansion Drive/Datasets/VoxCeleb1/",
    # noise_dir="/media/gabriele/Seagate Expansion Drive/Datasets/Musan/noise",
    # babble_dir="/media/gabriele/Seagate Expansion Drive/Datasets/Musan/speech"
)

In [None]:
# Read the features index CSV for the chosen clip length.
num_secs = 3
csv_base_path: str = "E:/Datasets/VoxCeleb1/subset/"
set_name: str = "train"
df = pd.read_csv(f"{csv_base_path}subset_features_{num_secs}.csv")

In [None]:
# Display the features index table.
df

In [None]:
# Keep only the rows whose "Set" column is "train" and display them.
is_train_row = df["Set"] == "train"
df_train = df.loc[is_train_row]
df_train

In [None]:
# Reload the "label" column of the labels CSV as an {index: label} dictionary.
labels_csv_path = f"{csv_base_path}subset_labels_{num_secs}.csv"
label_dict = pd.read_csv(labels_csv_path)["label"].to_dict()

In [None]:
# Display the label mapping.
label_dict

In [None]:
# Load every feature tensor listed in `df` as a NumPy array and collect the
# label looked up from `label_dict` for its speaker.
melspecs = []
y = []
for file_path, speaker in zip(df["File"], df["Speaker"]):
    melspec = torch.load(file_path).numpy()
    melspecs.append(melspec)
    y.append(label_dict[speaker])

In [None]:
# Stack all loaded feature arrays along the first axis and show the shape.
X = np.vstack(melspecs)
X.shape

In [None]:
# Flatten each sample into a single feature vector. Using -1 lets NumPy infer
# the flattened size instead of hard-coding 80*301, so the cell also works for
# other mel-bin counts or clip lengths.
X = X.reshape(X.shape[0], -1)
X.shape

In [None]:
# Turn the list of labels into an array, drop the trailing singleton axis and
# show the resulting shape.
y = np.vstack(y).squeeze(-1)
y.shape

In [None]:
# K-means with 8 clusters; a fixed random_state makes the clustering
# reproducible across kernel restarts.
kmeans = KMeans(n_clusters=8, random_state=42)

In [None]:
# Fit k-means on the flattened feature matrix.
kmeans.fit(X)

In [None]:
# Two-component PCA, used below to project the features for plotting.
pca = PCA(n_components=2)

In [None]:
# Fit the PCA on `X` and project it onto the two components.
Xpca = pca.fit_transform(X)

In [None]:
# Distinct label values present in `y`.
u_y = np.unique(y)

In [None]:
# Display the label array.
y

In [None]:
# Display the PCA projections of the samples whose label equals 0.
Xpca[y == 0]

In [None]:
# Scatter the 2-D PCA projection, one series per label.
fig, ax = plt.subplots()
for label in u_y:
    label_mask = y == label
    ax.scatter(
        Xpca[label_mask, 0],
        Xpca[label_mask, 1],
        label=label,
        alpha=0.7,
    )
ax.legend()
plt.show()

In [None]:
# Support-vector classifier with scikit-learn's default settings.
svc = SVC()

In [None]:
# Hold out 30% of the samples for evaluation; a fixed random_state keeps the
# split (and therefore the reported accuracies) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Fit the SVC on the training split.
svc.fit(X_train,y_train)

In [None]:
# Predict labels for the held-out split with the SVC.
pred = svc.predict(X_test)

In [None]:
# Accuracy of the SVC predictions on the held-out split.
accuracy_score(y_test, pred)

In [None]:
# MLP classifier with default architecture; a fixed random_state makes the
# weight initialisation and shuffling reproducible.
mlp = MLPClassifier(random_state=42)

In [None]:
# Fit the MLP on the training split.
mlp.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split with the MLP (rebinds `pred`, which
# previously held the SVC predictions).
pred = mlp.predict(X_test)

In [None]:
# Accuracy of the MLP predictions on the held-out split.
accuracy_score(y_test, pred)

In [None]:
# Throw-away AdamW optimizer over a single random 4x5 tensor, created only to
# inspect the structure of its state dict in the following cells.
dummy_params = [torch.randn(4, 5)]
optimizer = torch.optim.AdamW(dummy_params, lr=1e-3, eps=1e-8)

In [None]:
# Display the full optimizer state dict.
optimizer.state_dict()

In [None]:
# Display the first parameter group of the optimizer state dict.
optimizer.state_dict()["param_groups"][0]

In [None]:
# `random` is already imported in the notebook's first cell, so this re-import
# is redundant.
import random
# Draw one random integer between 0 and 4.
random.randint(0,4)

In [None]:
# Load a stored tensor from the 4-second feature directory.
# Raw string: same path value as before, without invalid escape sequences.
waveform = torch.load(
    r"E:\Datasets\VoxCeleb1\subset\features_4\train\id10206\0dATli9-ofc\00001_.pt"
)

In [None]:
# Display the shape of the tensor just loaded.
waveform.shape

In [None]:
# Compute features from the loaded tensor with the project helper, using its
# default arguments.
logmel = extract_logmel(waveform)

In [None]:
# Display the shape of the extracted features.
logmel.shape

In [None]:
# Pad `logmel` with the project helper `pad_tensor`.
# NOTE(review): the meaning of the positional arguments 401 and 450 is defined
# by `pad_tensor` in src.utils — confirm against its signature.
res = pad_tensor(
    logmel, 401, 450
)

In [None]:
# Inspect one slice of the padded result.
res[0][1]

In [None]:
# Call the project's `kmeans_plot` helper with its default arguments.
kmeans_plot()

In [None]:
# Reload the label mapping for the chosen clip length and derive the list of
# speaker IDs from its keys; the last expression shows how many there are.
num_secs = 3
csv_base_path: str = "E:/Datasets/VoxCeleb1/subset/"
labels_csv_path = f"{csv_base_path}subset_labels_{num_secs}.csv"
label_dict = pd.read_csv(labels_csv_path)["label"].to_dict()

speaker_ids = list(label_dict)
len(speaker_ids)

In [None]:
def create_verification_dataset(
    speaker_ids,
    num_speakers: int = 10,
    base_path: str = "E:/Datasets/VoxCeleb1/",
    n_mels: int = 80,
    power: float = 1.0, # 1 for energy, 2 for power
    to_db_flag: bool = True,
    cmn_flag: bool = True,
    n_fft: int = 400,
    win_length: Optional[int] = None,
    hop_length: int = 160,
    data_aug: bool = False,
    full_test: bool=True,
    seed: Optional[int] = None
):
    """Build a verification subset from speakers that are NOT in ``speaker_ids``.

    Steps:
      1. Read ``vox1_meta.csv`` and sample ``num_speakers`` speakers outside
         ``speaker_ids``, splitting the count between male and female speakers
         according to the gender ratio found in the metadata.
      2. Collect every utterance of the sampled speakers listed in
         ``iden_split.txt``.
      3. Run ``create_features_from_row`` on each utterance (no clipping and
         no augmentation objects are passed).
      4. Write the collected records to
         ``<base_path>subset/subset_verification.csv``.

    Args:
        speaker_ids: VoxCeleb1 IDs to exclude from the sampling.
        num_speakers: Number of speakers to sample.
        base_path: Dataset root. It is concatenated directly with file names,
            so it must end with a path separator.
        n_mels, power, to_db_flag, cmn_flag, n_fft, win_length, hop_length,
        data_aug: Forwarded unchanged to ``create_features_from_row``.
        full_test: Currently unused; kept so existing callers keep working.
        seed: Optional seed for the speaker sampling. ``None`` (default) uses
            the global ``random`` state, exactly as before.

    Raises:
        ValueError: If fewer male or female speakers are available than
            requested (raised by ``random.sample``).
    """
    gender_df = pd.read_csv(base_path + "vox1_meta.csv", sep="\t")

    # Compute the normalised counts once; `.get` avoids a KeyError when one
    # gender is absent.
    gender_ratios = gender_df["Gender"].value_counts(normalize=True)
    m_ratio = gender_ratios.get("m", 0.0)
    f_ratio = gender_ratios.get("f", 0.0)
    n_males = int(num_speakers * m_ratio)
    n_females = num_speakers - n_males

    # A private generator is used only when a seed is given, so the default
    # path still draws from the module-level `random` state.
    rng = random if seed is None else random.Random(seed)
    restricted_df = gender_df[~gender_df["VoxCeleb1 ID"].isin(speaker_ids)]

    def _sample_ids(gender, count):
        """Sample `count` distinct speaker IDs of the given gender."""
        candidates = restricted_df.loc[
            restricted_df["Gender"] == gender, "VoxCeleb1 ID"
        ].unique()
        return rng.sample(list(candidates), count)

    male_ids = _sample_ids("m", n_males)
    female_ids = _sample_ids("f", n_females)
    chosen_ids = male_ids + female_ids

    print(chosen_ids)

    # Constant-time lookups instead of scanning a list / filtering the
    # DataFrame for every line of the split file. `drop_duplicates` keeps the
    # first row per ID, matching the previous "first match" lookup.
    chosen_set = set(chosen_ids)
    gender_by_id = (
        gender_df.drop_duplicates("VoxCeleb1 ID")
        .set_index("VoxCeleb1 ID")["Gender"]
        .to_dict()
    )
    set_names = {"1": "train", "2": "val"}  # any other set number -> "test"

    ls = []
    with open(base_path + "iden_split.txt") as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            set_num, audio_path = line.split()
            speaker_id = audio_path.split("/")[0]
            if speaker_id not in chosen_set:
                continue
            ls.append(
                (
                    set_names.get(set_num, "test"),
                    speaker_id,
                    gender_by_id[speaker_id],
                    audio_path,
                )
            )

    df = pd.DataFrame(ls, columns=["Set", "Speaker", "Gender", "File"])

    sampled_ratios = (
        df.drop_duplicates("Speaker")["Gender"].value_counts(normalize=True)
    )
    m_sampled_ratio = sampled_ratios.get("m", 0.0)
    f_sampled_ratio = sampled_ratios.get("f", 0.0)

    print(
        f"Num speakers: {num_speakers}\n"
        f"Male ratio in dataset: {m_ratio}\n"
        f"Female ratio in dataset: {f_ratio}\n"
        f"Male sampled ratio: {m_sampled_ratio}\n"
        f"Female sampled ratio: {f_sampled_ratio}\n"
        f"Num sampled males: {n_males}\n"
        f"Num sampled females: {n_females}\n"
    )

    ls = []
    for _, row in tqdm(
        df.iterrows(),
        total=len(df),
        desc="Creating verification dataset",
        leave=False
    ):
        # copy_audio(row, base_path)
        feat_ls = create_features_from_row(
            row=row,
            base_path=base_path,
            rsc=None,
            rbn=None,
            reverb=None,
            babble=None,
            random_clip=None,
            clip_secs=None,
            n_mels=n_mels,
            power=power,
            to_db_flag=to_db_flag,
            cmn_flag=cmn_flag,
            n_fft=n_fft,
            win_length=win_length,
            hop_length=hop_length,
            data_aug=data_aug,
            features_dir="verification"
        )
        ls.extend(feat_ls)

    df = pd.DataFrame(
        ls,
        columns=[
            "Set", "Speaker", "Type", "Augment",
            "Seconds", "Path", "File"
        ]
    )

    csv_base_path = base_path + "subset/"

    df.to_csv(
        csv_base_path + "subset_verification.csv",
        index_label=False
    )

In [None]:
# Build the verification subset from 10 speakers that are not among the
# speakers listed in `speaker_ids`.
create_verification_dataset(
    speaker_ids=speaker_ids,
    num_speakers=10
)