In [25]:
from models.resnet.resnet_se_34v2 import ResNetSE34V2
from torchaudio.transforms import MelSpectrogram
from torchaudio.functional import amplitude_to_DB
import torch
import torchaudio
import yaml
import librosa
import numpy as np
import os 

In [26]:
def get_all_audio_path(audio_dir):
    """Recursively collect the paths of all ``.wav`` files under ``audio_dir``.

    Args:
        audio_dir: Root directory to walk.

    Returns:
        A sorted list of file paths, each built by joining the directory in
        which the file was found with its file name. The list is empty when
        no file name ends with ``.wav`` (the match is case-sensitive).
    """
    audio_path = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(audio_dir)
        for file in files
        if file.endswith(".wav")
    ]
    # os.walk yields entries in an arbitrary, filesystem-dependent order.
    # Sort so that positional uses of the result (first element, slices)
    # are reproducible from one run or machine to the next.
    audio_path.sort()
    return audio_path

In [27]:
# Paths of every .wav file found under the speaker-segment directory
# (the directory is given relative to the notebook's working directory).
audio_paths = get_all_audio_path("audio/speaker_segments/")

In [None]:
def load_audio(file, sr=16000, eps=1e-8):
    """Load an audio file and peak-normalise it.

    Args:
        file: Path (or anything ``librosa.load`` accepts) of the audio file.
        sr: Sample rate passed to ``librosa.load``. Defaults to 16000.
        eps: Small constant added to the peak amplitude so that an all-zero
            signal does not cause a division by zero. Defaults to 1e-8.

    Returns:
        The waveform returned by ``librosa.load`` scaled by
        ``1 / (max(|s|) + eps)``, i.e. with a peak magnitude just below 1.
    """
    s, _ = librosa.load(file, sr=sr)
    amax = np.max(np.abs(s))
    factor = 1.0 / (amax + eps)
    return s * factor

In [56]:
# Feature-extraction settings (the 'fbank' section is read below).
with open('./models/resnet/config.yaml') as f:
    config = yaml.safe_load(f)

# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints from a trusted source.
# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded on
# a machine without CUDA; the model below is built and run on the CPU.
sd = torch.load('./models/weigths/resnetse34_epoch92_eer0.00931.pth',
                map_location='cpu')
model = ResNetSE34V2(nOut=256, n_mels=config['fbank']['n_mels'])
model.load_state_dict(sd)
# Inference only: switch layers to eval behaviour and turn autograd off
# for the rest of the notebook session.
model.eval()
torch.set_grad_enabled(False)

# Mel-spectrogram front end, parameterised entirely from the 'fbank' config.
transform = MelSpectrogram(
    sample_rate=config['fbank']['sr'],
    n_fft=config['fbank']['n_fft'],
    win_length=config['fbank']['win_length'],
    hop_length=config['fbank']['hop_length'],
    window_fn=torch.hamming_window,
    n_mels=config['fbank']['n_mels'],
    f_min=config['fbank']['f_min'],
    f_max=config['fbank']['f_max'],
    norm='slaney')


def embed_inference(audio_path, transform= transform, model = model):
    """Compute a normalised speaker embedding for one audio file.

    Args:
        audio_path: Path of the audio file, forwarded to ``load_audio``.
        transform: Callable turning a ``(1, samples)`` waveform tensor into a
            mel spectrogram. Defaults to the module-level ``transform`` as it
            existed when this function was defined.
        model: Embedding network. Defaults to the module-level ``model`` as
            it existed when this function was defined.

    Returns:
        The model output after ``torch.nn.functional.normalize`` (L2 norm
        along dim 1). Presumably of shape ``(1, 256)`` given ``nOut=256``
        -- TODO confirm against the model definition.
    """
    s = load_audio(audio_path)
    # Do not depend on autograd having been disabled globally elsewhere in
    # the notebook: this function is inference-only.
    with torch.no_grad():
        x = torch.tensor(s[None, :])  # add a leading batch axis
        x = transform(x)
        x = amplitude_to_DB(
            x, multiplier=10, amin=config['fbank']['amin'], db_multiplier=0, top_db=75)

        # Insert a singleton axis at position 1 before feeding the network.
        feature = model(x[:, None, :, :])
        feature = torch.nn.functional.normalize(feature)
    return feature

Embedding size is 256, encoder SAP.


In [72]:
# Despite the name, this is a similarity score rather than a loss: cosine
# similarity computed along dim 1 (higher means more similar).
loss = torch.nn.CosineSimilarity(dim=1, eps=1e-6)

In [76]:
import pandas as pd

In [79]:

# Pairwise cosine-similarity table over every audio file except the first.
audio_paths_compar = audio_paths[1:]

# Embed each file exactly once up front so the pair loop below performs no
# forward passes (n model calls instead of n * n).
embeddings = {path: embed_inference(path) for path in audio_paths_compar}

# Collect rows in a plain list; growing a DataFrame row by row is quadratic.
records = []
for path_1 in audio_paths_compar:
    ref_embed = embeddings[path_1]
    for path_2 in audio_paths_compar:
        embed = embeddings[path_2]
        loss_value = loss(ref_embed, embed)
        records.append((path_1, path_2, loss_value.item()))

compar_tab = pd.DataFrame(records, columns=['path_1', 'path_2', 'score'])
# Write the table once, after all pairs have been scored. With no pairs this
# still produces a header-only file.
compar_tab.to_csv('compar_tab.csv', index=False)


In [69]:
# Scratch check: a 1x1 tensor holding a single standard-normal sample
# (no seed is set, so the value changes on every run).
test = torch.randn(1, 1)


In [70]:
# .item() extracts the single element of the tensor as a plain Python number.
print(test.item())

0.467571496963501
