# Adaptive Score normalization
This paper does a great job of explaining score normalization:
https://www.sciencedirect.com/science/article/pii/S1051200499903603
Notice that equation (1) in this paper is Bayes' Theorem (ignoring P(m) and using P(O|m_W) to represent P(O)).
For adaptive score normalization, see:
https://www.isca-speech.org/archive/interspeech_2017/matejka17_interspeech.html

This code is adapted from: 
https://github.com/juanmc2005/SpeakerEmbeddingLossComparison/blob/master/reproduce.ipynb

In [1]:
import numpy as np
import glob, os
from tqdm import tqdm
from kaldi_io import open_or_fd, read_vec_flt
from xarray import DataArray
from scipy.spatial.distance import cdist
from feerci import feerci

################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################



## Define helper functions to load speaker embeddings from ark files and score trials

In [2]:
def collect_vec(scpfile):
    """Load the embeddings listed in a Kaldi scp file and split them in two sets.

    Every line of the scp file is expected to hold a key and an rxfile
    separated by a single tab. The key must start with a speaker id of the
    form ``id<number>`` followed by ``-``.

    Args:
        scpfile: Path of the scp file (anything ``open_or_fd`` accepts).

    Returns:
        A ``(cohort_embedding, test_embedding)`` tuple of dicts.
        ``test_embedding`` maps the full utterance key to a list of vectors
        for speakers whose number lies in 10270..10309 (inclusive);
        ``cohort_embedding`` maps the speaker id to the list of all vectors
        of every other speaker.
    """
    cohort_embedding = dict()
    test_embedding = dict()

    fd = open_or_fd(scpfile)
    try:
        for line in tqdm(fd):
            key, rxfile = line.decode().split('\t')
            vec = read_vec_flt(rxfile)

            # The speaker number decides which dict receives the vector.
            spkid = key.split('-')[0]
            spknum = int(spkid.split('id')[-1])
            if 10270 <= spknum <= 10309:
                # Held-out speakers: stored per utterance.
                test_embedding.setdefault(key, []).append(vec)
            else:
                # Everyone else: pooled per speaker.
                cohort_embedding.setdefault(spkid, []).append(vec)
    finally:
        # Only close what was opened here; a caller-supplied handle stays open.
        if fd is not scpfile:
            fd.close()

    return cohort_embedding, test_embedding

# Get trial embeddings from VoxCeleb1
def get_trial_vec(testEmbedding, trialFile):
    """Gather the embeddings referenced by a trial list and score all pairs.

    Each line of ``trialFile`` is split on single spaces; fields 1 and 2 are
    wav paths which are converted to embedding keys of the form
    ``<first dir>-<second dir>-<file name without .wav>``.

    Args:
        testEmbedding: Mapping from embedding key to embedding(s).
        trialFile: Path of the trial list.

    Returns:
        ``(cache1, cache2, index1, index2, distance)``. ``cache1``/``cache2``
        map every key seen in the first/second path column to
        ``testEmbedding[key]``; ``index1``/``index2`` map the same keys to
        their position in first-seen order; ``distance`` is a ``DataArray``
        (dims ``file1`` x ``file2``) holding the cosine distance between
        every first-column and every second-column embedding.
    """
    def wav_to_key(wav_path):
        pieces = wav_path.split('/')
        return pieces[0] + '-' + pieces[1] + '-' + pieces[-1].split('.wav')[0]

    # key -> embedding, one dict per path column of the trial list
    cache1, cache2 = {}, {}
    # key -> row/column position inside the distance matrix
    index1, index2 = {}, {}

    with open(trialFile, 'r') as tf:
        for trialPair in tf:
            fields = trialPair.split(' ')
            key1 = wav_to_key(fields[1])
            key2 = wav_to_key(fields[2])

            for key, cache, index in ((key1, cache1, index1),
                                      (key2, cache2, index2)):
                if key in cache:
                    continue
                cache[key] = testEmbedding[key]
                # Positions are handed out in order of first appearance.
                index[key] = len(index)

    hashes1, hashes2 = list(cache1), list(cache2)
    emb1 = np.vstack([cache1[name] for name in hashes1])
    emb2 = np.vstack([cache2[name] for name in hashes2])

    pairwise = cdist(emb1, emb2, metric='cosine')
    distance = DataArray(pairwise,
                         dims=('file1', 'file2'),
                         coords=(hashes1, hashes2))

    return cache1, cache2, index1, index2, distance

# A function to calculate the EER on a subset of VoxCeleb1_X
def run_experiment(index1, index2, distance, trialFile):
    """Look up the distance of every trial and compute the EER with ``feerci``.

    Args:
        index1: Mapping from first-column key to row of ``distance.data``.
        index2: Mapping from second-column key to column of ``distance.data``.
        distance: Object whose ``.data`` is the 2-D distance matrix.
        trialFile: Path of the trial list; each line is split on single
            spaces into ``<label> <wav1> <wav2>``.

    Returns:
        Dict with ``eer``, ``ci_lower`` and ``ci_upper`` as returned by
        ``feerci``, plus the ``y_true`` labels and ``y_pred`` distances as
        NumPy arrays in trial order.
    """
    scores, labels = [], []
    with open(trialFile, 'r') as tf:
        for trialPair in tf:
            fields = trialPair.split(' ')

            # Turn both wav paths into "<dir>-<dir>-<name without .wav>" keys.
            keys = []
            for wav_path in (fields[1], fields[2]):
                pieces = wav_path.split('/')
                stem = pieces[-1].split('.wav')[0]
                keys.append(pieces[0] + '-' + pieces[1] + '-' + stem)

            row, col = index1[keys[0]], index2[keys[1]]
            scores.append(distance.data[row, col])
            labels.append(int(fields[0]))

    y_pred = np.array(scores)
    y_true = np.array(labels)

    label_is_0 = y_true == 0
    label_is_1 = y_true == 1
    # Distances are negated before scoring, presumably because feerci expects
    # larger values to mean "more likely the same speaker" (confirm in the
    # feerci docs). Label-0 trials go first, label-1 trials second.
    eer, ci_lower, ci_upper, _ = feerci(-y_pred[label_is_0],
                                        -y_pred[label_is_1],
                                        is_sorted=False)

    result = {'eer': eer, 'ci_lower': ci_lower, 'ci_upper': ci_upper}
    result['y_true'] = y_true
    result['y_pred'] = y_pred
    return result

In [3]:
# Experiment directory that holds the extracted embeddings (machine-specific path).
exp_folder = "/home/ge/kaldi/egs/voxceleb/pytorch/exp/ecapa-tdnn-tf-new"
# Trial list; get_trial_vec / run_experiment read each line as "<label> <wav1> <wav2>".
trial_file = '/storage/ge/voxceleb/test/wav/veri_test2.txt'
# Kaldi scp index of the embeddings, located under <exp_folder>/kaldi_ark/vox1.
scp_file = os.path.join(exp_folder, 'kaldi_ark', 'vox1', 'xvector_a_vox1.scp')
# Split the embeddings into a per-speaker cohort dict and a per-utterance test dict.
cohortEmbeds, testEmbeds = collect_vec(scp_file)


153516it [00:05, 26120.59it/s]


In [4]:
# Collect the embeddings used by the trial list and the cosine distance of every file1/file2 pair.
cache1, cache2, index1, index2, distances = get_trial_vec(testEmbeds, trial_file)

In [5]:
# Baseline: EER (with its confidence bounds) on the raw cosine distances, before normalization.
eer_dict = run_experiment(index1, index2, distances, trial_file)
print(eer_dict)

{'eer': 0.0174458809196949, 'ci_lower': 0.016435295343399048, 'ci_upper': 0.01882878504693508, 'y_true': array([1, 0, 1, ..., 0, 1, 0]), 'y_pred': array([0.45178772, 0.94382471, 0.43102617, ..., 0.99288202, 0.5142736 ,
       1.02745815])}


In [6]:
# Build the cohort from VoxCeleb1: one mean embedding per cohort speaker,
# stacked row-wise in the same order as cohortSpkrs.
cohortSpkrs = list(cohortEmbeds)
cohort = np.vstack([
    np.vstack(cohortEmbeds[spk]).mean(axis=0, keepdims=True)
    for spk in cohortSpkrs
])

## Distances between the trial embeddings and the cohort

In [7]:
# Keys (used as coordinate labels) and stacked embeddings of both trial columns.
hashes1, hashes2 = list(cache1), list(cache2)
emb1 = np.vstack([cache1[name] for name in hashes1])
emb2 = np.vstack([cache2[name] for name in hashes2])

# Cosine distance from every trial embedding (file1 and file2) to every cohort
# speaker mean; rows follow hashes1/hashes2, columns follow cohortSpkrs.
cohort_dist1 = cdist(emb1, cohort, metric='cosine')
distance1 = DataArray(cohort_dist1,
                      dims=('file1', 'cohort'),
                      coords=(hashes1, cohortSpkrs))

cohort_dist2 = cdist(emb2, cohort, metric='cosine')
distance2 = DataArray(cohort_dist2,
                      dims=('file2', 'cohort'),
                      coords=(hashes2, cohortSpkrs))

## Normalize scores w.r.t. the N most similar cohort embeddings


In [9]:
# This is our N: how many of the closest cohort speakers are used per embedding
COHORT_SIZE = 400


def _closest_cohort_distances(dist, n_closest):
    """Return the ``n_closest`` smallest distances of every row, in no particular order.

    When the cohort has ``n_closest`` columns or fewer, the whole matrix is
    returned so that ``np.partition`` is never called with an out-of-range
    ``kth`` (which raises ``ValueError``).
    """
    if n_closest >= dist.shape[1]:
        return dist
    # With kth=n_closest, the first n_closest entries of each row are the
    # n_closest smallest values of that row.
    return np.partition(dist, n_closest)[:, :n_closest]


# Mean and std of the N smallest cohort distances (most similar cohort
# embeddings) for file1
data1 = _closest_cohort_distances(distance1.data, COHORT_SIZE)
mz = DataArray(np.mean(data1, axis=1), dims=('file1',), coords=(hashes1,))
sz = DataArray(np.std(data1, axis=1), dims=('file1',), coords=(hashes1,))

# Mean and std of the N smallest cohort distances for file2
data2 = _closest_cohort_distances(distance2.data, COHORT_SIZE)
mt = DataArray(np.mean(data2, axis=1), dims=('file2',), coords=(hashes2,))
st = DataArray(np.std(data2, axis=1), dims=('file2',), coords=(hashes2,))

# Normalize. xarray broadcasts by dimension name, so mz/sz are applied along
# 'file1' and mt/st along 'file2' of the (file1, file2) distance matrix.
distance_z = (distances - mz) / sz
distance_t = (distances - mt) / st
# Symmetric normalization: average of the two normalized distances.
distance_s = 0.5 * (distance_z + distance_t)

In [10]:
# EER on the normalized distances (distance_s), to compare against the baseline in eer_dict.
snorm_eer_dict = run_experiment(index1, index2, distance_s, trial_file)
print(snorm_eer_dict)

{'eer': 0.016322851181030273, 'ci_lower': 0.015105579048395157, 'ci_upper': 0.018077433109283447, 'y_true': array([1, 0, 1, ..., 0, 1, 0]), 'y_pred': array([-6.74822005,  0.88052302, -6.67566938, ...,  1.96855125,
       -4.77046657,  2.83883655])}
