# Imports

In [51]:
import sys
sys.path.append('/home/usuaris/veu/federico.costa/git_repositories/DoubleAttentionSpeakerVerification/scripts/')
import os
import librosa
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import torch
from model import SpeakerClassifier
import pickle

# Functions

In [2]:
def load_labels(paths):
    
    labels = []
    for path in paths:
        with open(path, 'r') as data_labels_file:
            labels = labels + data_labels_file.readlines()
    return labels

In [14]:
def format_audio_path(audio_path, prepend_directories):

    # we need to remove the first "/" to join paths
    if audio_path[0] == "/":
        audio_path = audio_path[1:]

    # remove the file extension, if has
    if len(audio_path.split(".")) > 1:
        audio_path = '.'.join(audio_path.split(".")[:-1]) 

    # We prepend prepend_directory to the paths and the file extension
    data_founded = False
    for dir in prepend_directories:

        if data_founded == False:

            for audio_format in ['wav', 'm4a']:

                audio_file = f"{audio_path}.{audio_format}"
                complete_audio_file_path = os.path.join(dir, audio_file)

                if os.path.exists(complete_audio_file_path):
                    data_founded = True
                    break

    assert data_founded, f"{audio_path} not founded."

    return complete_audio_file_path

In [15]:
def get_audio_paths(labels_list, prepend_directories):

    speaker_1_audio_paths = []
    speaker_2_audio_paths = []
    total_lines = len(labels_list)
    progress_pctg_to_print = 0
    for index, label in enumerate(labels_list):

        label = label.replace('\n', '')
        label_chunks = label.split(' ')

        if len(label_chunks) == 2:

            # Validation or Test labels
            # label is of the form '/speaker/interview/file /speaker/interview/file'
            audio_path_1, audio_path_2 = label_chunks
            audio_path_1 = format_audio_path(audio_path_1, prepend_directories)
            audio_path_2 = format_audio_path(audio_path_2, prepend_directories)
            speaker_1_audio_paths.append(audio_path_1)
            speaker_2_audio_paths.append(audio_path_2)

        else:

            assert False, f"{label} has a not expected structure."

        progress_pctg = index / total_lines * 100
        if progress_pctg >=  progress_pctg_to_print:
            #print(f"{progress_pctg:.0f}% paths processed...")
            progress_pctg_to_print = progress_pctg_to_print + 1
            
    return speaker_1_audio_paths, speaker_2_audio_paths

# Settings

In [8]:
prepend_directories = [
    '/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCeleb2/dev/',
]

clients_labels_file_name = [
    '/home/usuaris/veu/federico.costa/git_repositories/DoubleAttentionSpeakerVerification/labels/valid/voxceleb_2/22_12_09_12_41_06_12nc5wq4_fiery-donkey-13/clients.ndx',
]
impostors_labels_file_name = [
    '/home/usuaris/veu/federico.costa/git_repositories/DoubleAttentionSpeakerVerification/labels/valid/voxceleb_2/22_12_09_12_41_29_ikfavyhj_ruby-microwave-14/impostors.ndx',
]

# Analysis

In [17]:
clients_labels_list = load_labels(clients_labels_file_name)

speaker_1_audio_paths, speaker_2_audio_paths = get_audio_paths(clients_labels_list[:10], prepend_directories)

In [22]:
df = pd.DataFrame(
    {
        "speaker_1_audio_path" : speaker_1_audio_paths[:1000],
        "speaker_2_audio_path" : speaker_2_audio_paths[:1000],
    }
)

In [24]:
def get_speaker(audio_path):
    
    speaker_chunk = [chunk for chunk in audio_path.split("/") if chunk.startswith("id")]
    # Only consider directories with /id.../
    if len(speaker_chunk) > 0: 
        speaker = speaker_chunk[0]
    
    return speaker

In [25]:
def get_duration(audio_path):

    audio_duration = librosa.get_duration(filename = audio_path)
    
    return audio_duration

In [37]:
def replace_extension(path, new_extension):
    
    path = '.'.join(path.split(".")[:-1])
    path = f"{path}.{new_extension}"
    
    return path

In [27]:
df["speaker_1"] = df["speaker_1_audio_path"].apply(lambda x: get_speaker(x))
df["speaker_2"] = df["speaker_2_audio_path"].apply(lambda x: get_speaker(x))

In [39]:
df["speaker_1_feature_path"] = df["speaker_1_audio_path"].apply(lambda x: replace_extension(x, "pickle"))
df["speaker_2_feature_path"] = df["speaker_2_audio_path"].apply(lambda x: replace_extension(x, "pickle"))

In [29]:
%%time

df["duration_seconds_audio_speaker_1"] = df["speaker_1_audio_path"].apply(lambda x: get_duration(x))

df["duration_seconds_audio_speaker_2"] = df["speaker_2_audio_path"].apply(lambda x: get_duration(x))

CPU times: user 141 ms, sys: 54.8 ms, total: 196 ms
Wall time: 1.46 s


In [40]:
df.head()

Unnamed: 0,speaker_1_audio_path,speaker_2_audio_path,speaker_1,speaker_2,duration_seconds_audio_speaker_1,duration_seconds_audio_speaker_2,speaker_1_feature_path,speaker_2_feature_path
0,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...,id08821,id08821,8.3,4.5,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...
1,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...,id04274,id04274,12.3,11.2,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...
2,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...,id08637,id08637,7.2,5.1,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...
3,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...,id08860,id08860,4.0,11.7,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...
4,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...,id01470,id01470,5.5,4.2,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...,/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCel...


In [31]:
checkpoint_path = '/home/usuaris/veu/federico.costa/git_repositories/DoubleAttentionSpeakerVerification/models/22_12_14_09_48_59_vgg_tmh_ap_fc_VGGNL_TransformerStackedAttentionPooling_c0whzsyq/22_12_14_09_48_59_vgg_tmh_ap_fc_VGGNL_TransformerStackedAttentionPooling_c0whzsyq.chkpt'


In [32]:
device = "cuda" if torch.cuda.is_available() else "cpu"

checkpoint = torch.load(checkpoint_path, map_location = device)
params = checkpoint['settings']

net = SpeakerClassifier(params, device)

try:
    net.load_state_dict(checkpoint['model'])
except RuntimeError:    
    net.module.load_state_dict(checkpoint['model'])

In [46]:
df["speaker_1_feature_path"].iloc[0]

'/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCeleb2/dev/id08821/A1zaOlZpvdQ/00074.pickle'

In [47]:
df["speaker_2_feature_path"].iloc[0]

'/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCeleb2/dev/id08821/D0nlv29V-9c/00104.pickle'

In [48]:
def normalize(features, normalization = 'cmn'):

    # Cepstral mean normalization
    if normalization == 'cmn':

        # Compute the mean for each frequency band (columns)
        mean = np.mean(features, axis = 0)

        # Substract for each column the corresponding column mean
        features = features - mean

    # Cepstral mean and variance normalization
    elif normalization == 'cmvn':

        # Compute the mean for each frequency band (columns)
        mean = np.mean(features, axis = 0)

        # Substract for each column the corresponding column mean
        features = features - mean

        # Compute the standard deviation for each frequency band (columns)
        std = np.std(features, axis = 0)

        # HACK guess this is to avoid zero division overflow
        std = np.where(std > 0.01, std, 1.0)

        # Divide for each column the corresponding column std
        features = features / std

    return features

In [49]:
def get_feature_vector(utterance_path):

    # Load the spectrogram saved in pickle format
    with open(utterance_path, 'rb') as pickle_file:
        features_dict = pickle.load(pickle_file)

    features = features_dict["features"]
    features = np.transpose(features)
    features = normalize(features)
    
    return features

In [52]:
with torch.no_grad():

    # Switch torch to evaluation mode
    net.eval()
    
    input_1, input_2 = [
        get_feature_vector(df["speaker_1_feature_path"].iloc[0]), 
        get_feature_vector(df["speaker_1_feature_path"].iloc[1]),
    ]
    
    if torch.cuda.device_count() > 1:
        embedding_1 = net.module.get_embedding(input_1)
        embedding_2 = net.module.get_embedding(input_2)
    else:
        embedding_1 = net.get_embedding(input_1)
        embedding_2 = net.get_embedding(input_2)

TypeError: 'int' object is not callable