In [2]:
import random
import os
import sys
import pandas as pd
import torch
import torchaudio
import torch.nn.functional as F
import torchaudio.transforms as T
import librosa
import matplotlib.pyplot as plt
from IPython.display import Audio, display
import json
import numpy as np
import math
import seaborn as sns
# Notebook-wide plot styling: seaborn's default theme first, then the "ticks"
# axes style and the "paper" scaling context applied on top of it.
sns.set_theme()
sns.set_style("ticks")
sns.set_context("paper")

from typing import Optional

from datetime import timedelta

from src.utils import (
    create_dataset, plot_spectrogram,
    RandomClip, extract_logmel, pad_tensor, plot_waveform,
    train_sklearn_model, get_dataset_stats
)
from src.datasets import VoxCelebDataModule
from src.models import SEBlock, SpeakerRecognitionModel, build_efficientnetv2
from torch import nn
from sklearn.decomposition import PCA

from src.losses import SubCenterAAMSoftmaxLoss
from sklearn.cluster import KMeans
from sklearn.metrics import roc_curve, accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

from src.utils import (
    RandomBackgroundNoise, RandomClip, RandomSpeedChange,
    create_features_from_row, kmeans_plot, create_verification_dataset
)
from tqdm.auto import tqdm
from pedalboard import Pedalboard, Reverb, Chorus

In [None]:
# Sample VoxCeleb1 utterance used by the feature-extraction cells that follow.
# Written as a raw string: the previous literal only worked because Python
# passes unrecognised escapes such as "\D", "\V" and "\i" through unchanged,
# which newer interpreters warn about. The resulting path is identical.
SAMPLE_WAV_SPEECH_PATH = r"E:\Datasets\VoxCeleb1\vox1_dev\id10015\7rzuEmfRFEA\00001.wav"
# torchaudio.load returns the waveform tensor together with the file's sample rate.
waveform, sample_rate = torchaudio.load(SAMPLE_WAV_SPEECH_PATH)

In [None]:
# Load one stored tensor from the verification test split.
# Raw string so the Windows separators are never parsed as escape sequences
# ("\s" and "\i" were invalid escapes before); the path itself is unchanged.
melspec = torch.load(r"E:\Datasets\VoxCeleb1\subset\verification_None\test\id10153\x29OJk3Ec-Q\00001_.pt")

In [None]:
# Features from the project's extract_logmel helper with 80 mel bins
# (log-mel, going by the helper's name). Overwrites any earlier `melspec`.
melspec = extract_logmel(waveform, n_mels=80)

In [None]:
# MFCC transform for comparison: 40 coefficients, configured for 16 kHz audio.
mfcc_t = torchaudio.transforms.MFCC(
    sample_rate=16000,
    n_mfcc=40
)

In [None]:
mfcc = mfcc_t(waveform)

In [None]:
# Plot the first entry along the leading axis (presumably the channel - TODO confirm).
plot_spectrogram(melspec[0])

In [None]:
# Plain spectrogram using torchaudio's default settings.
spr = torchaudio.transforms.Spectrogram()
sprr = spr(waveform)

In [None]:
# librosa mel spectrogram of the first row of the waveform; power=1 uses the
# magnitude spectrogram (exponent 1) rather than squaring it.
sprr2 = librosa.feature.melspectrogram(
    y=waveform.numpy()[0],
    sr=16000,
    power=1
)

In [None]:
# NOTE(review): no epsilon is added before the log, so a zero-valued bin becomes -inf.
plot_spectrogram(np.log(sprr2))

In [None]:
# A different sample utterance, read from a Linux mount of the dataset. This
# rebinds SAMPLE_WAV_SPEECH_PATH, waveform and sample_rate.
SAMPLE_WAV_SPEECH_PATH = "/media/gabriele/Seagate Expansion Drive/Datasets/VoxCeleb1/vox1_dev/id10001/1zcIwhmdeo4/00001.wav"
waveform, sample_rate = torchaudio.load(SAMPLE_WAV_SPEECH_PATH)

In [None]:
waveform, sample_rate

In [None]:
waveform.shape

In [None]:
# Effect chain containing a single reverb with room_size 0.75.
reverb = Pedalboard(
    [Reverb(room_size=0.75)]
)

In [None]:
# Effect chain: chorus followed by a reverb with room_size 0.25.
board = Pedalboard([Chorus(), Reverb(room_size=0.25)])

In [None]:
# NOTE(review): the rate is hard-coded to 16000 instead of using the
# sample_rate returned by torchaudio.load - confirm they agree for this file.
board(waveform, sample_rate=16000)

In [None]:
reverb(waveform, sample_rate=16000)

In [None]:
# Project transform with default arguments; presumably cuts a random segment
# out of the waveform - TODO confirm in src.utils.
rc = RandomClip()

In [None]:
clipped = rc(waveform)
clipped.shape

In [None]:
# Mel-spectrogram transform: 512-point FFT, 400-sample window, 160-sample hop
# (25 ms / 10 ms if sample_rate is 16 kHz), 80 mel bands on the HTK scale.
n_fft = 512
mel_spectrogram = T.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        win_length=400,
        hop_length=160,
        center=True,
        pad_mode="reflect",
        power=2.0, # exponent 2 = power spectrogram (1.0 would give the magnitude)
        norm="slaney",
        onesided=True,
        n_mels=80,
        mel_scale="htk",
    )

In [None]:
# Sliding-window cepstral mean normalisation and dB conversion. Neither object
# is applied in the cells visible in this part of the notebook.
# NOTE(review): stype="amplitude" does not match the power=2.0 spectrogram
# configured above; stype="power" would be the matching setting if to_db is
# ever applied to its output.
cmn = T.SlidingWindowCmn(cmn_window=n_fft)
to_db = T.AmplitudeToDB(stype="amplitude")

In [None]:
# NOTE: despite the variable name, no log is taken here - this is the raw
# output of the mel-spectrogram transform.
logmel = mel_spectrogram(clipped)
logmel.shape

In [None]:
logmel

In [None]:
# Rebind logmel to the project helper's output for the same clip.
logmel = extract_logmel(clipped, sample_rate=16000, n_mels=80)
logmel.shape

In [None]:
# Load one stored training tensor from the 3-second feature set.
# Raw string: the earlier literal depended on invalid escapes ("\D", "\V",
# "\s", "\i") being kept verbatim; this spells out the same path explicitly.
logmel = torch.load(r"E:\Datasets\VoxCeleb1\subset\features_3\train\id10075\08H4--mL1LQ\00001_.pt")

In [None]:
# Log-compress, then instance-normalise.
# The previous version stored the log in a misspelled variable ("logeml"), so
# the log result was discarded and the normalisation ran on the un-logged
# tensor. Both steps now operate on `logmel`.
# The epsilon keeps log() finite for zero-valued bins.
# NOTE(review): log() yields NaN for negative inputs - confirm the stored
# tensor is a non-negative spectrogram rather than an already log/dB-scaled one.
logmel = torch.log(logmel + 1e-6)
logmel = F.instance_norm(logmel)

In [None]:
# NOTE(review): this plots `melspec`, not the `logmel` tensor the surrounding
# cells work on - confirm which one was meant.
plot_spectrogram(melspec[0])

In [None]:
# SpecAugment-style masking transforms: the mask width is drawn at random on
# each call, bounded by 10 along frequency and by 5 along time.
freq_masking = T.FrequencyMasking(freq_mask_param=10)
time_masking = T.TimeMasking(time_mask_param=5)

In [None]:
spec_logmel = time_masking(freq_masking(logmel))

In [None]:
# The masks are drawn again here, so this plot is a fresh random masking and
# not the `spec_logmel` tensor computed in the previous cell.
plot_spectrogram(time_masking(freq_masking(logmel))[0])

In [None]:
# Label table for the 3-second subset.
num_secs = 3
csv_base_path: str = "E:/Datasets/VoxCeleb1/subset/"
# DataFrame.to_dict() maps column -> {row index -> value}; only the "label"
# column's mapping is kept. Presumably the row index holds the speaker ids -
# TODO confirm against the CSV.
label_dict = pd.read_csv(
    csv_base_path + f"subset_labels_{num_secs}.csv"
).to_dict()["label"]

In [None]:
# Keys of the label mapping, reused below as the speaker selection.
speaker_ids = list(label_dict.keys())
len(speaker_ids)

In [None]:
# Build the dataset through the project helper, restricted to the speakers
# listed above. The commented-out arguments are alternative Linux locations
# for the data and the noise corpora.
create_dataset(
    num_speakers=100, 
    to_db_flag=True,
    cmn_flag=True,
    clip_secs=4,
    n_fft=400,
    win_length=400,
    hop_length=160,
    n_mels=80,
    power=1.0,
    data_aug=False,
    speaker_ids=speaker_ids,
    full_test=True,
    wave_test=True
    # base_path="/media/gabriele/Seagate Expansion Drive/Datasets/VoxCeleb1/",
    # noise_dir="/media/gabriele/Seagate Expansion Drive/Datasets/Musan/noise",
    # babble_dir="/media/gabriele/Seagate Expansion Drive/Datasets/Musan/speech"
)

In [None]:
# Label table for the 3-second subset.
# Raw string so the Windows separators are not parsed as escape sequences
# ("\D", "\V" and "\s" were invalid escapes); the path is unchanged.
df = pd.read_csv(r"E:\Datasets\VoxCeleb1\subset\subset_labels_3.csv")
# The frame's row index is used as the list of speaker ids.
speaker_ids = df.index.tolist()

In [None]:
# Smaller run of the project helper: 5 speakers, 6-second clips, with the dB
# and CMN flags enabled. The commented-out arguments are alternative Linux
# locations for the data and the noise corpora.
create_dataset(
    num_speakers=5,
    clip_secs=6, 
    to_db_flag=True, 
    cmn_flag=True,
    speaker_ids=speaker_ids
    # base_path="/media/gabriele/Seagate Expansion Drive/Datasets/VoxCeleb1/",
    # noise_dir="/media/gabriele/Seagate Expansion Drive/Datasets/Musan/noise",
    # babble_dir="/media/gabriele/Seagate Expansion Drive/Datasets/Musan/speech"
)

In [5]:
# Feature index for the 4-second subset. `set_name` is defined here but the
# cells shown below filter on the literal "train" instead.
num_secs = 4
csv_base_path: str = "E:/Datasets/VoxCeleb1/subset/"
set_name: str = "train"
df = pd.read_csv(
            csv_base_path + f"subset_features_{num_secs}.csv"
        )

In [7]:
# Number of rows whose "Set" column is "train".
len(df[df["Set"] == "train"])

59140

In [None]:
# Display the full feature index.
df

In [None]:
# Rows belonging to the training split.
df_train = df[df["Set"] == "train"]
df_train

In [None]:
# Label mapping for the current num_secs: the {row index -> value} dict of the
# CSV's "label" column.
label_dict = pd.read_csv(
            csv_base_path + f"subset_labels_{num_secs}.csv"
        ).to_dict()["label"]

In [None]:
label_dict

In [None]:
# Walk the feature index once: load each stored tensor as a NumPy array and
# record the label looked up for that row's speaker.
melspecs, y = [], []
for idx, row in df.iterrows():
    melspec = torch.load(row["File"]).numpy()
    melspecs.append(melspec)
    y.append(label_dict[row["Speaker"]])

In [None]:
# Stack all loaded arrays along the first axis into a single array.
X = np.vstack(melspecs)
X.shape

In [None]:
# Flatten each sample into a single feature vector for the scikit-learn models.
# -1 lets NumPy infer the vector length instead of hard-coding 80 * 301, so
# the cell also works for other mel-bin counts or clip lengths.
X = X.reshape(X.shape[0], -1)
X.shape

In [None]:
# Turn the label list into an array: vstack gives one row per label, then the
# trailing axis is squeezed away.
y = np.vstack(y).squeeze(-1)
y.shape

In [None]:
# K-means with 8 clusters on the flattened features.
# NOTE(review): no random_state is set, so the clustering can differ per run.
kmeans = KMeans(n_clusters=8)

In [None]:
kmeans.fit(X)

In [None]:
# Two principal components, for a 2-D view of the feature space.
pca = PCA(2)

In [None]:
Xpca = pca.fit_transform(X)

In [None]:
# Distinct label values, used to draw one scatter series per label.
u_y = np.unique(y)

In [None]:
y

In [None]:
# Projected points whose label equals 0.
Xpca[y == 0]

In [None]:
# Scatter plot of the 2-D PCA projection, one series per label.
# Uses the explicit Axes interface and adds axis labels, a title and a legend
# title so the figure can be read on its own.
fig, ax = plt.subplots()
for label in u_y:
    label_mask = y == label  # rows of Xpca that belong to this label
    ax.scatter(
        Xpca[label_mask, 0],
        Xpca[label_mask, 1],
        label=label,
        alpha=0.7
    )
ax.set_xlabel("Principal component 1")
ax.set_ylabel("Principal component 2")
ax.set_title("PCA projection of the flattened features")
ax.legend(title="Label")
plt.show()

In [None]:
# Support-vector classifier with scikit-learn's default hyper-parameters.
svc = SVC()

In [None]:
# Hold out 30% of the samples for evaluation. The split is seeded so the
# accuracies reported by the following cells are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Fit the SVC on the training part of the split.
svc.fit(X_train,y_train)

In [None]:
pred = svc.predict(X_test)

In [None]:
# Accuracy of the SVC on the held-out part.
accuracy_score(y_test, pred)

In [None]:
# Multi-layer perceptron with scikit-learn's defaults.
# NOTE(review): no random_state is set, so results can differ per run.
mlp = MLPClassifier()

In [None]:
mlp.fit(X_train, y_train)

In [None]:
# Rebinds `pred`, replacing the SVC predictions.
pred = mlp.predict(X_test)

In [None]:
# Accuracy of the MLP on the held-out part.
accuracy_score(y_test, pred)

In [None]:
# Throwaway AdamW optimizer over one random 4x5 tensor, created only so the
# structure of its state dict can be inspected in the next cells.
optimizer = torch.optim.AdamW(
    [torch.randn((4,5))], 
    lr=1e-3, 
    eps=1e-8
)

In [None]:
optimizer.state_dict()

In [None]:
# Settings of the first (and only) parameter group.
optimizer.state_dict()["param_groups"][0]

In [None]:
# Quick check of random.randint, whose bounds are inclusive on both ends.
# `random` is already imported in the notebook's first cell, so the duplicate
# mid-notebook import has been removed.
random.randint(0,4)

In [None]:
# Load one stored training tensor from the 4-second feature set; this rebinds
# `waveform`.
# Raw string: the previous literal relied on invalid escapes ("\D", "\V",
# "\s", "\i") being passed through; the path is unchanged.
waveform = torch.load(
    r"E:\Datasets\VoxCeleb1\subset\features_4\train\id10206\0dATli9-ofc\00001_.pt"
)

In [None]:
# NOTE(review): `waveform` was just loaded from a features_4 .pt file - confirm
# it holds raw audio rather than precomputed features before it is passed to
# extract_logmel below.
waveform.shape

In [None]:
# Project helper called with its default arguments.
logmel = extract_logmel(waveform)

In [None]:
logmel.shape

In [None]:
# Project helper; the meaning of the two integer arguments is not visible
# here - see pad_tensor in src.utils.
res = pad_tensor(
    logmel, 401, 450
)

In [None]:
res[0][1]

In [None]:
# Project helper called with its default arguments.
kmeans_plot()

In [4]:
# Label mapping for the 4-second subset: the {row index -> value} dict of the
# CSV's "label" column.
num_secs = 4
csv_base_path: str = "E:/Datasets/VoxCeleb1/subset/"
label_dict = pd.read_csv(
    csv_base_path + f"subset_labels_{num_secs}.csv"
).to_dict()["label"]

# Keys of that mapping, used as the speaker selection further down.
speaker_ids = list(label_dict.keys())
len(speaker_ids)

100

In [None]:
# Verification subset table for the 4-second setting.
# The file name contains no placeholders, so it is a plain string literal
# rather than an f-string; the path read is unchanged.
csv_base_path: str = "E:/Datasets/VoxCeleb1/subset/"
ver_df = pd.read_csv(
    csv_base_path + "subset_verification_4.csv"
)
ver_df

In [None]:
# Distinct values of the "Speaker" column in the verification table.
chosen_ids = ver_df["Speaker"].unique()

In [None]:
# NOTE(review): `chosen_ids` is not used here; the full `speaker_ids` list is
# passed instead - confirm that is intended.
create_verification_dataset(
    speaker_ids=speaker_ids,
    num_speakers=10
)

In [None]:
get_dataset_stats()

In [None]:
# Fresh default SVC for the project's scikit-learn training helper.
svc = SVC()

In [None]:
# The helper returns predictions first and ground truth second.
y_pred, y_true = train_sklearn_model(
    svc,
    limited_train=50
)

In [None]:
accuracy_score(y_true, y_pred)

In [None]:
# Hand-pasted value converted to a percentage; it is not recomputed from
# y_true / y_pred (presumably an accuracy copied from an earlier run).
round(0.13984962406015036 * 100, 2)

In [None]:
# F1 score averaged over classes, weighted by class support.
f1_score(y_true, y_pred, average="weighted")

In [None]:
# More hand-pasted values being rounded for reporting.
round(0.6425952911376953 * 100, 2), round(0.0480, 3)

In [None]:
kmeans_plot(limited_train=50)

In [None]:
# Load one stored training tensor from the 4-second feature set.
# Raw string: the previous literal relied on invalid escapes (including "\_")
# being passed through unchanged; the path is the same.
a = torch.load(
    r"E:\Datasets\VoxCeleb1\subset\features_4\train\id10118\_QrOkTwSOeE\00001_.pt"
)

In [3]:
# Print summary statistics for the full dataset and the two subsets.
# Slow: the saved progress bar below shows about 36 minutes for the full pass.
get_dataset_stats()

Starting
Got the dataframes
To analyze entire dataset


Analyzing entire dataset: 100%|██████████| 153516/153516 [36:36<00:00, 69.89it/s] 


Finished analysis of entire dataset
********************
GENERAL STATS
********************
Samples in entire dataset: 153516
Samples in identification subset (without augment): 13042
Samples in verification subset: 758
Speakers in entire dataset: 1251
Speakers in identification subset: 100
Speakers in verification subset: 10
Gender in entire dataset:
m    0.55
f    0.45
Name: Gender, dtype: float64
Gender in identification subset:
m    0.55
f    0.45
dtype: float64
Gender in verification subset:
m    0.5
f    0.5
dtype: float64
Nationality in entire dataset:
USA                    0.64
UK                     0.17
Canada                 0.04
Australia              0.03
India                  0.02
Norway                 0.02
Ireland                0.01
Germany                0.01
New Zealand            0.01
Italy                  0.01
Mexico                 0.01
Sweden                 0.00
Russia                 0.00
Spain                  0.00
Philippines            0.00
Croatia       